KelSolaar commited on
Commit
fa06c67
·
0 Parent(s):

Initial commit.

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +37 -0
  2. .gitignore +37 -0
  3. .pre-commit-config.yaml +39 -0
  4. LICENSE +11 -0
  5. README.md +278 -0
  6. docs/_static/gamma_sweep_plot.pdf +0 -0
  7. docs/_static/gamma_sweep_plot.png +3 -0
  8. docs/learning_munsell.md +478 -0
  9. learning_munsell/__init__.py +7 -0
  10. learning_munsell/analysis/__init__.py +1 -0
  11. learning_munsell/analysis/error_analysis.py +304 -0
  12. learning_munsell/comparison/from_xyY/__init__.py +1 -0
  13. learning_munsell/comparison/from_xyY/compare_all_models.py +1292 -0
  14. learning_munsell/comparison/from_xyY/compare_gamma_model.py +390 -0
  15. learning_munsell/comparison/to_xyY/__init__.py +1 -0
  16. learning_munsell/comparison/to_xyY/compare_all_models.py +617 -0
  17. learning_munsell/data_generation/generate_training_data.py +310 -0
  18. learning_munsell/interpolation/__init__.py +1 -0
  19. learning_munsell/interpolation/from_xyY/__init__.py +43 -0
  20. learning_munsell/interpolation/from_xyY/compare_methods.py +208 -0
  21. learning_munsell/interpolation/from_xyY/delaunay_interpolator.py +283 -0
  22. learning_munsell/interpolation/from_xyY/kdtree_interpolator.py +263 -0
  23. learning_munsell/interpolation/from_xyY/rbf_interpolator.py +300 -0
  24. learning_munsell/losses/__init__.py +17 -0
  25. learning_munsell/losses/jax_delta_e.py +299 -0
  26. learning_munsell/models/__init__.py +47 -0
  27. learning_munsell/models/networks.py +1294 -0
  28. learning_munsell/training/from_xyY/__init__.py +1 -0
  29. learning_munsell/training/from_xyY/hyperparameter_search_error_predictor.py +503 -0
  30. learning_munsell/training/from_xyY/hyperparameter_search_multi_head.py +541 -0
  31. learning_munsell/training/from_xyY/hyperparameter_search_multi_head_error_predictor.py +552 -0
  32. learning_munsell/training/from_xyY/hyperparameter_search_multi_mlp.py +471 -0
  33. learning_munsell/training/from_xyY/refine_multi_head_real.py +358 -0
  34. learning_munsell/training/from_xyY/train_deep_wide.py +371 -0
  35. learning_munsell/training/from_xyY/train_ft_transformer.py +356 -0
  36. learning_munsell/training/from_xyY/train_mixture_of_experts.py +620 -0
  37. learning_munsell/training/from_xyY/train_mlp.py +269 -0
  38. learning_munsell/training/from_xyY/train_mlp_attention.py +460 -0
  39. learning_munsell/training/from_xyY/train_mlp_error_predictor.py +457 -0
  40. learning_munsell/training/from_xyY/train_mlp_gamma.py +297 -0
  41. learning_munsell/training/from_xyY/train_multi_head_3stage_error_predictor.py +411 -0
  42. learning_munsell/training/from_xyY/train_multi_head_circular.py +479 -0
  43. learning_munsell/training/from_xyY/train_multi_head_cross_attention_error_predictor.py +640 -0
  44. learning_munsell/training/from_xyY/train_multi_head_gamma.py +300 -0
  45. learning_munsell/training/from_xyY/train_multi_head_gamma_sweep.py +605 -0
  46. learning_munsell/training/from_xyY/train_multi_head_large.py +246 -0
  47. learning_munsell/training/from_xyY/train_multi_head_mlp.py +269 -0
  48. learning_munsell/training/from_xyY/train_multi_head_multi_error_predictor.py +378 -0
  49. learning_munsell/training/from_xyY/train_multi_head_multi_error_predictor_large.py +409 -0
  50. learning_munsell/training/from_xyY/train_multi_head_st2084.py +313 -0
.gitattributes ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.onnx.data filter=lfs diff=lfs merge=lfs -text
18
+ *.ot filter=lfs diff=lfs merge=lfs -text
19
+ *.parquet filter=lfs diff=lfs merge=lfs -text
20
+ *.pb filter=lfs diff=lfs merge=lfs -text
21
+ *.pickle filter=lfs diff=lfs merge=lfs -text
22
+ *.pkl filter=lfs diff=lfs merge=lfs -text
23
+ *.pt filter=lfs diff=lfs merge=lfs -text
24
+ *.pth filter=lfs diff=lfs merge=lfs -text
25
+ *.rar filter=lfs diff=lfs merge=lfs -text
26
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
27
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
29
+ *.tar filter=lfs diff=lfs merge=lfs -text
30
+ *.tflite filter=lfs diff=lfs merge=lfs -text
31
+ *.tgz filter=lfs diff=lfs merge=lfs -text
32
+ *.wasm filter=lfs diff=lfs merge=lfs -text
33
+ *.xz filter=lfs diff=lfs merge=lfs -text
34
+ *.zip filter=lfs diff=lfs merge=lfs -text
35
+ *.zst filter=lfs diff=lfs merge=lfs -text
36
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
37
+ *.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Common Files
2
+ *.egg-info
3
+ *.pyc
4
+ *.pyo
5
+ .DS_Store
6
+ .coverage*
7
+ uv.lock
8
+
9
+ # Common Directories
10
+ .fleet/
11
+ .idea/
12
+ .ipynb_checkpoints/
13
+ .python-version
14
+ .vs/
15
+ .vscode/
16
+ .sandbox/
17
+ build/
18
+ dist/
19
+ docs/_build/
20
+ docs/generated/
21
+ node_modules/
22
+ references/
23
+
24
+ __pycache__
25
+
26
+ .claude/settings.local.json
27
+ .claude/scratchpad.md
28
+
29
+ # Project Directories
30
+ data/
31
+ logs/
32
+ mlartifacts/
33
+ mlruns/
34
+ mlruns.db
35
+ reports/
36
+ results/
37
+ runs/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: "v5.0.0"
4
+ hooks:
5
+ - id: check-added-large-files
6
+ - id: check-case-conflict
7
+ - id: check-merge-conflict
8
+ - id: check-symlinks
9
+ - id: check-yaml
10
+ - id: debug-statements
11
+ - id: end-of-file-fixer
12
+ - id: mixed-line-ending
13
+ - id: requirements-txt-fixer
14
+ - id: trailing-whitespace
15
+ - repo: https://github.com/codespell-project/codespell
16
+ rev: v2.4.1
17
+ hooks:
18
+ - id: codespell
19
+ args: ["--ignore-words-list=colour"]
20
+ - repo: https://github.com/PyCQA/isort
21
+ rev: "6.0.1"
22
+ hooks:
23
+ - id: isort
24
+ - repo: https://github.com/astral-sh/ruff-pre-commit
25
+ rev: "v0.12.4"
26
+ hooks:
27
+ - id: ruff-format
28
+ - id: ruff
29
+ args: [--fix]
30
+ - repo: https://github.com/pre-commit/mirrors-prettier
31
+ rev: "v4.0.0-alpha.8"
32
+ hooks:
33
+ - id: prettier
34
+ - repo: https://github.com/pre-commit/pygrep-hooks
35
+ rev: "v1.10.0"
36
+ hooks:
37
+ - id: rst-backticks
38
+ - id: rst-directive-colons
39
+ - id: rst-inline-touching-normal
LICENSE ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright 2025 Colour Developers
2
+
3
+ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
4
+
5
+ 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
6
+
7
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
8
+
9
+ 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
10
+
11
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
README.md ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: bsd-3-clause
3
+ language:
4
+ - en
5
+ tags:
6
+ - python
7
+ - colour
8
+ - color
9
+ - colour-science
10
+ - color-science
11
+ - colour-spaces
12
+ - color-spaces
13
+ - colourspace
14
+ - colorspace
15
+ pipeline_tag: tabular-regression
16
+ library_name: onnxruntime
17
+ metrics:
18
+ - mae
19
+ model-index:
20
+ - name: from_xyY (CIE xyY to Munsell)
21
+ results:
22
+ - task:
23
+ type: tabular-regression
24
+ name: CIE xyY to Munsell Specification
25
+ dataset:
26
+ name: CIE xyY to Munsell Specification
27
+ type: munsell-renotation
28
+ metrics:
29
+ - type: delta-e
30
+ value: 0.52
31
+ name: Delta-E CIE2000
32
+ - type: inference_time_ms
33
+ value: 0.089
34
+ name: Inference Time (ms/sample)
35
+ - name: to_xyY (Munsell to CIE xyY)
36
+ results:
37
+ - task:
38
+ type: tabular-regression
39
+ name: Munsell Specification to CIE xyY
40
+ dataset:
41
+ name: Munsell Specification to CIE xyY
42
+ type: munsell-renotation
43
+ metrics:
44
+ - type: delta-e
45
+ value: 0.48
46
+ name: Delta-E CIE2000
47
+ - type: inference_time_ms
48
+ value: 0.008
49
+ name: Inference Time (ms/sample)
50
+ ---
51
+
52
+ # Learning Munsell - Machine Learning for Munsell Color Conversions
53
+
54
+ A project implementing machine learning-based methods for bidirectional conversion between CIE xyY colourspace values and Munsell specifications.
55
+
56
+ **Two Conversion Directions:**
57
+
58
+ - **from_xyY**: CIE xyY to Munsell specification
59
+ - **to_xyY**: Munsell specification to CIE xyY
60
+
61
+ ## Project Overview
62
+
63
+ ### Objective
64
+
65
+ Provide 100-1000x speedup for batch Munsell conversions compared to colour-science routines while maintaining high accuracy.
66
+
67
+ ### Results
68
+
69
+ **from_xyY** (CIE xyY to Munsell) — evaluated on all 2,734 REAL Munsell colors:
70
+
71
+ | Model | Delta-E | Speed (ms) |
72
+ |----------------------------------------------------------| ---------- | ---------- |
73
+ | Colour Library (Baseline) | 0.00 | 111.90 |
74
+ | **Multi-ResNet + Multi-Error Predictor (Large Dataset)** | **0.52** | 0.089 |
75
+ | Multi-MLP (W+B) + Multi-Error Predictor (W+B) Large | 0.52 | 0.057 |
76
+ | Multi-MLP + Multi-Error Predictor (Large Dataset) | 0.52 | 0.058 |
77
+ | Multi-MLP + Multi-Error Predictor | 0.53 | 0.058 |
78
+ | MLP + Error Predictor | 0.53 | 0.030 |
79
+ | Multi-ResNet (Large Dataset) | 0.54 | 0.044 |
80
+ | Multi-Head + Multi-Error Predictor | 0.54 | 0.042 |
81
+ | Multi-Head + Multi-Error Predictor (Large Dataset) | 0.56 | 0.043 |
82
+ | Deep + Wide | 0.60 | 0.074 |
83
+ | Multi-Head (Large Dataset) | 0.66 | 0.013 |
84
+ | Mixture of Experts | 0.80 | 0.020 |
85
+ | Transformer (Large Dataset) | 0.82 | 0.123 |
86
+ | Multi-MLP | 0.86 | 0.027 |
87
+ | MLP + Self-Attention | 0.88 | 0.173 |
88
+ | MLP (Base Only) | 1.09 | **0.007** |
89
+ | Unified MLP | 1.12 | 0.072 |
90
+
91
+ - **Best Accuracy**: Multi-ResNet + Multi-Error Predictor (Large Dataset) — Delta-E 0.52, 1,252x faster
92
+ - **Fastest**: MLP Base Only (0.007 ms/sample) — 15,492x faster than Colour library
93
+ - **Best Balance**: Multi-MLP (W+B: Weighted Boundary) + Multi-Error Predictor (W+B) Large — 1,951x faster with Delta-E 0.52
94
+
95
+ **to_xyY** (Munsell to CIE xyY) — evaluated on all 2,734 REAL Munsell colors:
96
+
97
+ | Model | Delta-E | Speed (ms) |
98
+ | --------------------------------------------- | ---------- | ----------- |
99
+ | Colour Library (Baseline) | 0.00 | 1.27 |
100
+ | **Multi-MLP (Optimized)** | **0.48** | 0.008 |
101
+ | Multi-MLP (Opt) + Multi-Error Predictor (Opt) | 0.48 | 0.025 |
102
+ | Multi-MLP + Multi-Error Predictor | 0.65 | 0.030 |
103
+ | Multi-MLP | 0.66 | 0.016 |
104
+ | Multi-MLP + Error Predictor | 0.67 | 0.018 |
105
+ | Multi-Head (Optimized) | 0.71 | 0.015 |
106
+ | Multi-Head | 0.78 | 0.008 |
107
+ | Multi-Head + Multi-Error Predictor | 1.11 | 0.028 |
108
+ | Simple MLP | 1.42 | **0.0008** |
109
+
110
+ - **Best Accuracy**: Multi-MLP (Optimized) — Delta-E 0.48, 154x faster
111
+ - **Fastest**: Simple MLP (0.0008 ms/sample) — 1,654x faster than Colour library
112
+
113
+ ### Approach
114
+
115
+ - **25+ architectures** tested for from_xyY (MLP, Multi-Head, Multi-MLP, Multi-ResNet, Transformers, Mixture of Experts)
116
+ - **9 architectures** tested for to_xyY (Simple MLP, Multi-Head, Multi-MLP with error predictors)
117
+ - **Two-stage models** (base + error predictor) on large dataset proved most effective
118
+ - **Best model**: Multi-ResNet + Multi-Error Predictor (Large Dataset) with Delta-E 0.52
119
+ - **Training data**: ~1.4M samples from dense xyY grid with boundary refinement and forward Munsell sampling
120
+ - **Deployment**: ONNX format with ONNX Runtime
121
+
122
+ For detailed architecture comparisons, model benchmarks, training pipeline details, and experimental results, see [docs/learning_munsell.md](docs/learning_munsell.md).
123
+
124
+ ## Installation
125
+
126
+ **Dependencies (Runtime)**:
127
+
128
+ - numpy >= 2.0
129
+ - onnxruntime >= 1.16
130
+
131
+ **Dependencies (Training)**:
132
+
133
+ - torch >= 2.0
134
+ - scikit-learn >= 1.3
135
+ - matplotlib >= 3.9
136
+ - mlflow >= 2.10
137
+ - optuna >= 3.0
138
+ - colour-science >= 0.4.7
139
+ - click >= 8.0
140
+ - onnx >= 1.15
141
+ - onnxscript >= 0.5.6
142
+ - tqdm >= 4.66
143
+ - jax >= 0.4.20
144
+ - jaxlib >= 0.4.20
145
+ - flax >= 0.10.7
146
+ - optax >= 0.2.6
147
+ - scipy >= 1.12
148
+ - tensorboard >= 2.20
149
+
150
+ From the project root:
151
+
152
+ ```bash
153
+ cd learning-munsell
154
+
155
+ # Install all dependencies (creates virtual environment automatically)
156
+ uv sync
157
+ ```
158
+
159
+ ## Usage
160
+
161
+ ### Generate Training Data
162
+
163
+ ```bash
164
+ uv run python learning_munsell/data_generation/generate_training_data.py
165
+ ```
166
+
167
+ **Note**: This step is computationally expensive (uses iterative algorithm for ground truth).
168
+
169
+ ### Train Models
170
+
171
+ **xyY to Munsell (from_xyY)**
172
+
173
+ Best performing model (Multi-ResNet + Multi-Error Predictor on Large Dataset):
174
+
175
+ ```bash
176
+ # Train base Multi-ResNet on large dataset (~1.4M samples)
177
+ uv run python learning_munsell/training/from_xyY/train_multi_resnet_large.py
178
+
179
+ # Train multi-error predictor
180
+ uv run python learning_munsell/training/from_xyY/train_multi_resnet_error_predictor_large.py
181
+ ```
182
+
183
+ Alternative (Multi-Head architecture):
184
+
185
+ ```bash
186
+ uv run python learning_munsell/training/from_xyY/train_multi_head_large.py
187
+ uv run python learning_munsell/training/from_xyY/train_multi_head_multi_error_predictor_large.py
188
+ ```
189
+
190
+ Other architectures:
191
+
192
+ ```bash
193
+ uv run python learning_munsell/training/from_xyY/train_unified_mlp.py
194
+ uv run python learning_munsell/training/from_xyY/train_multi_mlp.py
195
+ uv run python learning_munsell/training/from_xyY/train_mlp_attention.py
196
+ uv run python learning_munsell/training/from_xyY/train_deep_wide.py
197
+ uv run python learning_munsell/training/from_xyY/train_ft_transformer.py
198
+ ```
199
+
200
+ **Munsell to xyY (to_xyY)**
201
+
202
+ Best performing model (Multi-MLP Optimized):
203
+
204
+ ```bash
205
+ uv run python learning_munsell/training/to_xyY/train_multi_mlp.py
206
+ uv run python learning_munsell/training/to_xyY/train_multi_head.py
207
+ uv run python learning_munsell/training/to_xyY/train_multi_mlp_multi_error_predictor.py
208
+ uv run python learning_munsell/training/to_xyY/train_multi_mlp_error_predictor.py
209
+ uv run python learning_munsell/training/to_xyY/train_multi_head_multi_error_predictor.py
210
+ ```
211
+
212
+ Train the differentiable approximator for use in Delta-E loss:
213
+
214
+ ```bash
215
+ uv run python learning_munsell/training/to_xyY/train_munsell_to_xyY_approximator.py
216
+ ```
217
+
218
+ ### Hyperparameter Search
219
+
220
+ ```bash
221
+ uv run python learning_munsell/training/from_xyY/hyperparameter_search_multi_head.py
222
+ uv run python learning_munsell/training/from_xyY/hyperparameter_search_multi_head_error_predictor.py
223
+ ```
224
+
225
+ ### Compare All Models
226
+
227
+ ```bash
228
+ uv run python learning_munsell/comparison/from_xyY/compare_all_models.py
229
+ ```
230
+
231
+ Generates comprehensive HTML report at `reports/from_xyY/model_comparison.html`.
232
+
233
+ ### Monitor Training
234
+
235
+ **MLflow**:
236
+
237
+ ```bash
238
+ uv run mlflow ui --backend-store-uri "sqlite:///mlruns.db" --port=5000
239
+ ```
240
+
241
+ Open <http://localhost:5000> in your browser.
242
+
243
+ ## Directory Structure
244
+
245
+ ```
246
+ learning-munsell/
247
+ +-- data/ # Training data
248
+ | +-- training_data.npz # Generated training samples
249
+ | +-- training_data_large.npz # Large dataset (~1.4M samples)
250
+ | +-- training_data_params.json # Generation parameters
251
+ | +-- training_data_large_params.json
252
+ +-- models/ # Trained models (ONNX + PyTorch)
253
+ | +-- from_xyY/ # xyY to Munsell models (25+ ONNX models)
254
+ | | +-- multi_resnet_error_predictor_large.onnx # BEST
255
+ | | +-- ... (additional model variants)
256
+ | +-- to_xyY/ # Munsell to xyY models (9 ONNX models)
257
+ | +-- multi_mlp_optimized.onnx # BEST
258
+ | +-- ... (additional model variants)
259
+ +-- learning_munsell/ # Source code
260
+ | +-- analysis/ # Analysis scripts
261
+ | +-- comparison/ # Model comparison scripts
262
+ | +-- data_generation/ # Data generation scripts
263
+ | +-- interpolation/ # Classical interpolation methods
264
+ | +-- losses/ # Loss functions (JAX Delta-E)
265
+ | +-- models/ # Model architecture definitions
266
+ | +-- training/ # Model training scripts
267
+ | +-- utilities/ # Shared utilities
268
+ +-- docs/ # Documentation
269
+ +-- reports/ # HTML comparison reports
270
+ +-- logs/ # Script output logs
271
+ +-- mlruns.db # MLflow experiment tracking database
272
+ ```
273
+
274
+ ## About
275
+
276
+ **Learning Munsell** by Colour Developers
277
+ Research project for the Colour library
278
+ <https://github.com/colour-science/colour>
docs/_static/gamma_sweep_plot.pdf ADDED
Binary file (22.2 kB). View file
 
docs/_static/gamma_sweep_plot.png ADDED

Git LFS Details

  • SHA256: e2a0d5dc57c0d37d5889cff4ac41a08b490387a54615d4372af5e5bd86018e36
  • Pointer size: 131 Bytes
  • Size of remote file: 137 kB
docs/learning_munsell.md ADDED
@@ -0,0 +1,478 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Learning Munsell
2
+
3
+ Technical documentation covering performance benchmarks, training methodology, architecture design, and experimental findings.
4
+
5
+ ## Overview
6
+
7
+ This project implements ML models for bidirectional conversion between CIE xyY colorspace values and Munsell specifications:
8
+
9
+ - **xyY to Munsell (from_xyY)**: 25+ architectures, best Delta-E 0.52
10
+ - **Munsell to xyY (to_xyY)**: 9 architectures, best Delta-E 0.48
11
+
12
+ ### Delta-E Interpretation
13
+
14
+ - **< 1.0**: Not perceptible by the human eye
15
+ - **1-2**: Perceptible through close observation
16
+ - **2-10**: Perceptible at a glance
17
+ - **> 10**: Colors are perceived as completely different
18
+
19
+ Our best models achieve **Delta-E 0.48-0.52**, meaning the difference between ML prediction and iterative algorithm is **not perceptible by the human eye**.
20
+
21
+ ## xyY to Munsell (from_xyY)
22
+
23
+ ### Performance Benchmarks
24
+
25
+ Comprehensive comparison using all 2,734 REAL Munsell colors:
26
+
27
+ | Model | Delta-E | Speed (ms) |
28
+ |----------------------------------------------------------|-------------|------------|
29
+ | Colour Library (Baseline) | 0.00 | 111.90 |
30
+ | **Multi-ResNet + Multi-Error Predictor (Large Dataset)** | **0.52** | 0.089 |
31
+ | Multi-MLP (W+B) + Multi-Error Predictor (W+B) Large | 0.52 | 0.057 |
32
+ | Multi-MLP + Multi-Error Predictor (Large Dataset) | 0.52 | 0.058 |
33
+ | Multi-MLP + Multi-Error Predictor | 0.53 | 0.058 |
34
+ | MLP + Error Predictor | 0.53 | 0.030 |
35
+ | Multi-ResNet (Large Dataset) | 0.54 | 0.044 |
36
+ | Multi-Head + Multi-Error Predictor | 0.54 | 0.042 |
37
+ | Multi-Head + Multi-Error Predictor (Large Dataset) | 0.56 | 0.043 |
38
+ | Deep + Wide | 0.60 | 0.074 |
39
+ | Multi-Head (Large Dataset) | 0.66 | 0.013 |
40
+ | Mixture of Experts | 0.80 | 0.020 |
41
+ | Transformer (Large Dataset) | 0.82 | 0.123 |
42
+ | Multi-MLP | 0.86 | 0.027 |
43
+ | MLP + Self-Attention | 0.88 | 0.173 |
44
+ | MLP (Base Only) | 1.09 | **0.007** |
45
+ | Unified MLP | 1.12 | 0.072 |
46
+
47
+ Note: The Colour library baseline had 171 convergence failures out of 2,734 samples (6.3% failure rate).
48
+
49
+ **Best Models**:
50
+
51
+ - **Best Accuracy**: Multi-ResNet + Multi-Error Predictor (Large Dataset) - Delta-E 0.52
52
+ - **Fastest**: MLP Base Only (0.007 ms/sample) - 15,492x faster than Colour library
53
+ - **Best Balance**: Multi-MLP (W+B: Weighted Boundary) + Multi-Error Predictor (W+B) Large - 1,951x faster with Delta-E 0.52
54
+
55
+ ### Model Architectures
56
+
57
+ 25+ architectures were systematically evaluated:
58
+
59
+ **Single-Stage Models**
60
+
61
+ 1. **MLP (Base Only)** - Simple MLP network, 3 inputs to 4 outputs
62
+ 2. **Unified MLP** - Single large MLP with shared features
63
+ 3. **Multi-Head** - Shared encoder with 4 independent decoder heads
64
+ 4. **Multi-Head (Large Dataset)** - Multi-Head trained on 1.4M samples
65
+ 5. **Multi-MLP** - 4 completely independent MLP branches (one per output)
66
+ 6. **Multi-MLP (Large Dataset)** - Multi-MLP trained on 1.4M samples
67
+ 7. **MLP + Self-Attention** - MLP with attention mechanism for feature weighting
68
+ 8. **Deep + Wide** - Combined deep and wide network paths
69
+ 9. **Mixture of Experts** - Gating network selecting specialized expert networks
70
+ 10. **Transformer (Large Dataset)** - Feature Tokenizer Transformer for tabular data
71
+ 11. **FT-Transformer** - Feature Tokenizer Transformer (standard size)
72
+
73
+ **Two-Stage Models**
74
+
75
+ 12. **MLP + Error Predictor** - Base MLP with unified error correction
76
+ 13. **Multi-Head + Multi-Error Predictor** - Multi-Head with 4 independent error predictors
77
+ 14. **Multi-Head + Multi-Error Predictor (Large Dataset)** - Large dataset variant
78
+ 15. **Multi-MLP + Multi-Error Predictor** - 4 independent branches with 4 independent error predictors
79
+ 16. **Multi-MLP + Multi-Error Predictor (Large Dataset)** - Large dataset variant
80
+ 17. **Multi-ResNet + Multi-Error Predictor (Large Dataset)** - Deep ResNet-style branches (BEST)
81
+
82
+ The **Multi-ResNet + Multi-Error Predictor (Large Dataset)** architecture achieved the best results with Delta-E 0.52.
83
+
84
+ ### Training Methodology
85
+
86
+ **Data Generation**
87
+
88
+ 1. **Dense xyY Grid** (~500K samples)
89
+ - Regular grid in valid xyY space (MacAdam limits for Illuminant C)
90
+ - Captures general input distribution
91
+ 2. **Boundary Refinement** (~700K samples)
92
+ - Adaptive dense sampling near Munsell gamut boundaries
93
+ - Uses `maximum_chroma_from_renotation` to detect edges
94
+ - Focuses on regions where iterative algorithm is most complex
95
+ - Includes Y/GY/G hue regions with high value/chroma (challenging areas)
96
+ 3. **Forward Augmentation** (~200K samples)
97
+ - Dense Munsell space sampling via `munsell_specification_to_xyY`
98
+ - Ensures coverage of known valid colors
99
+
100
+ Total: ~1.4M samples for large dataset training.
101
+
102
+ **Loss Functions**
103
+
104
+ Two loss function approaches were tested:
105
+
106
+ *Precision-Focused Loss* (Default):
107
+
108
+ ```
109
+ total_loss = 1.0 * MSE + 0.5 * MAE + 0.3 * log_penalty + 0.5 * huber_loss
110
+ ```
111
+
112
+ - MSE: Standard mean squared error
113
+ - MAE: Mean absolute error
114
+ - Log penalty: Heavily penalizes small errors (pushes toward high precision)
115
+ - Huber loss: Small delta (0.01) for precision on small errors
116
+
117
+ *Pure MSE Loss* (Optimized config):
118
+
119
+ ```
120
+ total_loss = MSE
121
+ ```
122
+
123
+ Interestingly, the precision-focused loss achieved better Delta-E despite higher validation MSE, suggesting the custom weighting better correlates with perceptual accuracy.
124
+
125
+ ### Design Rationale
126
+
127
+ **Two-Stage Architecture**
128
+
129
+ The error predictor stage corrects systematic biases in the base model:
130
+
131
+ 1. Base model learns the general xyY to Munsell mapping
132
+ 2. Error predictor learns residual corrections specific to each component
133
+ 3. Combined prediction: `final = base_prediction + error_correction`
134
+
135
+ This decomposition allows each stage to specialize and reduces the complexity each network must learn.
136
+
137
+ **Independent Branch Design**
138
+
139
+ Munsell components have different characteristics:
140
+
141
+ - **Hue**: Circular (0-10, wrapping), most complex
142
+ - **Value**: Linear (0-10), easiest to predict
143
+ - **Chroma**: Highly variable range depending on hue/value
144
+ - **Code**: Discrete hue sector (0-9)
145
+
146
+ Shared encoders force compromises between these different prediction tasks. Independent branches allow full specialization.
147
+
148
+ **Architecture Details**
149
+
150
+ *MLP (Base Only)*
151
+
152
+ Simple feedforward network predicting all 4 outputs simultaneously:
153
+
154
+ Input (3) ──► Linear Layers ──► Output (4: hue, value, chroma, code)
155
+
156
+ - Smallest model (~8KB ONNX)
157
+ - Fastest inference (0.007 ms)
158
+ - Baseline for comparison
159
+
160
+ *Unified MLP*
161
+
162
+ Single large MLP with shared internal features:
163
+
164
+ Input (3) ──► 128 ──► 256 ──► 512 ──► 256 ──► 128 ──► Output (4)
165
+
166
+ - Shared representations across all outputs
167
+ - Moderate size, good speed
168
+
169
+ *Multi-Head MLP*
170
+
171
+ Shared encoder with specialized decoder heads:
172
+
173
+ Input (3) ──► SHARED ENCODER (3→128→256→512) ──┬──► Hue Head (512→256→128→1)
174
+ ├──► Value Head (512→256→128→1)
175
+ ├──► Chroma Head (512→384→256→128→1)
176
+ └──► Code Head (512→256→128→1)
177
+
178
+ - Shared encoder learns common color space features
179
+ - 4 specialized decoder heads branch from shared representation
180
+ - Parameter efficient (encoder weights shared)
181
+ - Fast inference (encoder computed once)
182
+
183
+ *Multi-MLP*
184
+
185
+ Fully independent branches with no weight sharing:
186
+
187
+ Input (3) ──► Hue Branch (3→128→256→512→256→128→1)
188
+ Input (3) ──► Value Branch (3→128→256→512→256→128→1)
189
+ Input (3) ──► Chroma Branch (3→256→512→1024→512→256→1) [2x wider]
190
+ Input (3) ──► Code Branch (3→128→256→512→256→128→1)
191
+
192
+ - 4 completely independent MLPs
193
+ - Each branch learns its own features from scratch
194
+ - Chroma branch is wider (2x) to handle its complexity
195
+ - Better accuracy than Multi-Head on large dataset (Delta-E 0.52 vs 0.56 with error predictors)
196
+
197
+ *Multi-ResNet*
198
+
199
+ Deep branches with residual-style connections:
200
+
201
+ Input (3) ──► Hue Branch (3→256→512→512→512→256→1) [6 layers]
202
+ Input (3) ──► Value Branch (3→256→512→512→512→256→1) [6 layers]
203
+ Input (3) ──► Chroma Branch (3→512→1024→1024→1024→512→1) [6 layers, 2x wider]
204
+ Input (3) ──► Code Branch (3→256→512→512→512→256→1) [6 layers]
205
+
206
+ - Deeper architecture than Multi-MLP
207
+ - BatchNorm + SiLU activation
208
+ - Best accuracy when combined with error predictor (Delta-E 0.52)
209
+ - Largest model (~14MB base, ~28MB with error predictor)
210
+
211
+ *Deep + Wide*
212
+
213
+ Combined deep and wide network paths:
214
+
215
+ Input (3) ──┬──► Deep Path (multiple layers) ──┬──► Concat ──► Output (4)
216
+ └──► Wide Path (direct connection) ─┘
217
+
218
+ - Deep path captures complex patterns
219
+ - Wide path preserves direct input information
220
+ - Good for mixed linear/nonlinear relationships
221
+
222
+ *MLP + Self-Attention*
223
+
224
+ MLP with attention mechanism for feature weighting:
225
+
226
+ Input (3) ──► MLP ──► Self-Attention ──► Output (4)
227
+
228
+ - Attention weights learn feature importance
229
+ - Slower due to attention computation (0.173 ms)
230
+ - Did not improve over simpler MLPs
231
+
232
+ *Mixture of Experts*
233
+
234
+ Gating network selecting specialized expert networks:
235
+
236
+ Input (3) ──► Gating Network ──► Weighted sum of Expert outputs ──► Output (4)
237
+
238
+ - Multiple expert networks specialize in different input regions
239
+ - Gating network learns which expert to use
240
+ - More complex but did not outperform Multi-MLP
241
+
242
+ *FT-Transformer*
243
+
244
+ Feature Tokenizer Transformer for tabular data:
245
+
246
+ Input (3) ──► Feature Tokenizer ──► Transformer Blocks ──► Output (4)
247
+
248
+ - Each input feature tokenized separately
249
+ - Self-attention across feature tokens
250
+ - Good for tabular data with feature interactions
251
+ - Slower inference due to attention computation
252
+
253
+ *Error Predictor (Two-Stage)*
254
+
255
+ Second-stage network that corrects base model errors:
256
+
257
+ Stage 1: Input (3) ──► Base Model ──► Base Prediction (4)
258
+ Stage 2: [Input (3), Base Prediction (4)] ──► Error Predictor ──► Error Correction (4)
259
+ Final: Base Prediction + Error Correction = Final Output
260
+
261
+ - Learns residual corrections for each component
262
+ - Can have unified (1 network) or multi (4 networks) error predictors
263
+ - Consistently improves accuracy across all base architectures
264
+ - Best results: Multi-ResNet + Multi-Error Predictor (Delta-E 0.52)
265
+
266
+ **Loss-Metric Mismatch**
267
+
268
+ An important finding: **optimizing MSE does not optimize Delta-E**.
269
+
270
+ The Optuna hyperparameter search minimized validation MSE, but the best MSE configuration did not achieve the best Delta-E. This is because:
271
+
272
+ - MSE treats all component errors equally
273
+ - Delta-E (CIE2000) weights errors based on human perception
274
+ - The precision-focused loss with custom weights better approximates perceptual importance
275
+
276
+ **Weighted Boundary Loss (Experimental)**
277
+
278
+ Analysis of model errors revealed systematic underperformance on Y/GY/G hues (Yellow/Green-Yellow/Green) with high value and chroma. The weighted boundary loss approach was explored to address this by:
279
+
280
+ 1. Applying 3x loss weight to samples in challenging regions:
281
+ - Hue: 0.18-0.35 (normalized range covering Y/GY/G)
282
+ - Value > 0.7 (high brightness)
283
+ - Chroma > 0.5 (high saturation)
284
+ 2. Adding boundary penalty to prevent predictions exceeding Munsell gamut limits
285
+
286
+ **Finding**: The large dataset approach (~1.4M samples with dense boundary sampling) naturally provides sufficient coverage of these challenging regions. Both the weighted boundary loss model (Multi-MLP W+B + Multi-Error Predictor W+B Large, Delta-E 0.524) and the standard large dataset model (Multi-MLP + Multi-Error Predictor Large, Delta-E 0.525) achieve nearly identical results, making explicit loss weighting optional. The best overall model is Multi-ResNet + Multi-Error Predictor (Large Dataset) with Delta-E 0.52.
287
+
288
+ ### Experimental Findings
289
+
290
+ The following experiments were conducted but did not improve results:
291
+
292
+ **Delta-E Training**
293
+
294
+ Training with differentiable Delta-E CIE2000 loss via round-trip through the Munsell-to-xyY approximator.
295
+
296
+ *Hypothesis*: Perceptual Delta-E loss might outperform MSE-trained models.
297
+
298
+ *Implementation*: JAX/Flax model with combined MSE + Delta-E loss. Requires lower learning rate (1e-4 vs 3e-4) for stability; higher rates cause NaN gradients.
299
+
300
+ *Results*: While Delta-E is comparable, **hue accuracy is ~10x worse**:
301
+
302
+ | Metric (Normalized MAE) | Delta-E Model | MSE Model |
303
+ |--------------------------|---------------|-----------|
304
+ | Hue MAE | 0.30 | 0.03 |
305
+ | Value MAE | 0.002 | 0.004 |
306
+ | Chroma MAE | 0.007 | 0.008 |
307
+ | Code MAE | 0.07 | 0.01 |
308
+ | **Delta-E (perceptual)** | **0.52** | **0.50** |
309
+
310
+ *Key Takeaway*: **Perceptual similarity != specification accuracy**. The MSE model's slightly better Delta-E (0.50 vs 0.52) comes at the cost of ~10x worse hue accuracy, making it unsuitable for specification prediction. Delta-E is too permissive for hue, allowing the model to find "shortcuts" that minimize perceptual difference without correctly predicting the Munsell specification.
311
+
312
+ **Classical Interpolation**
313
+
314
+ Classical interpolation methods were tested on 4,995 reference Munsell colors (80% train / 20% test split). ML models were evaluated on 2,734 REAL Munsell colors.
315
+
316
+ *Results (Validation MAE)*:
317
+
318
+ | Component | RBF | KD-Tree | Delaunay | ML (Best) |
319
+ |-----------|------|---------|----------|-----------|
320
+ | Hue | 1.40 | 1.40 | 1.29 | **0.03** |
321
+ | Value | 0.01 | 0.10 | 0.02 | 0.05 |
322
+ | Chroma | 0.22 | 0.99 | 0.35 | **0.11** |
323
+ | Code | 0.33 | 0.28 | 0.28 | **0.00** |
324
+
325
+ *Key Insight*: The reference dataset (4,995 colors) is too sparse for 3D xyY interpolation. Classical methods fail on hue prediction (MAE ~1.3-1.4), while ML achieves 47x better hue accuracy and 2-3x better chroma/code accuracy.
326
+
327
+ **Circular Hue Loss**
328
+
329
+ Circular distance metrics for hue prediction, accounting for cyclic nature (0-10 wraps).
330
+
331
+ *Results*: The circular loss model performed **21x worse** on hue MAE (5.14 vs 0.24).
332
+
333
+ *Key Takeaway*: **Mathematical correctness != training effectiveness**. The circular distance creates gradient discontinuities that harm optimization.
334
+
335
+ **REAL-Only Refinement**
336
+
337
+ Fine-tuning using only REAL Munsell colors (2,734) instead of ALL colors (4,995).
338
+
339
+ *Results*: Essentially identical performance (Delta-E 1.5233 vs 1.5191).
340
+
341
+ *Key Takeaway*: **Data quality is not the bottleneck**. Both REAL and extrapolated colors are sufficiently accurate.
342
+
343
+ **Gamma Normalization**
344
+
345
+ Gamma correction to the Y (luminance) channel during normalization.
346
+
347
+ *Results*: No consistent improvement across gamma values 1.0-3.0:
348
+
349
+ | Gamma | Median ΔE (± std) |
350
+ |----------------|-------------------|
351
+ | 1.0 (baseline) | 0.730 ± 0.054 |
352
+ | 2.5 (best) | 0.683 ± 0.132 |
353
+
354
+ ![Gamma sweep results](_static/gamma_sweep_plot.png)
355
+
356
+ *Key Takeaway*: **Gamma normalization does not provide consistent improvement**. Standard deviations overlap - differences are within noise.
357
+
358
+ ## Munsell to xyY (to_xyY)
359
+
360
+ ### Performance Benchmarks
361
+
362
+ Comprehensive comparison using all 2,734 REAL Munsell colors:
363
+
364
+ | Model | Delta-E | Speed (ms) |
365
+ |-----------------------------------------------|-------------|------------|
366
+ | Colour Library (Baseline) | 0.00 | 1.27 |
367
+ | **Multi-MLP (Optimized)** | **0.48** | 0.008 |
368
+ | Multi-MLP (Opt) + Multi-Error Predictor (Opt) | 0.48 | 0.025 |
369
+ | Multi-MLP + Multi-Error Predictor | 0.65 | 0.030 |
370
+ | Multi-MLP | 0.66 | 0.016 |
371
+ | Multi-MLP + Error Predictor | 0.67 | 0.018 |
372
+ | Multi-Head (Optimized) | 0.71 | 0.015 |
373
+ | Multi-Head | 0.78 | 0.008 |
374
+ | Multi-Head + Multi-Error Predictor | 1.11 | 0.028 |
375
+ | Simple MLP | 1.42 | **0.0008** |
376
+
377
+ **Best Models**:
378
+
379
+ - **Best Accuracy**: Multi-MLP (Optimized) - Delta-E 0.48
380
+ - **Fastest**: Simple MLP (0.0008 ms/sample) - 1,654x faster than Colour library
381
+ - **Best Balance**: Multi-MLP (Optimized) - 154x faster with Delta-E 0.48
382
+
383
+ ### Model Architectures
384
+
385
+ 9 architectures were evaluated for the Munsell to xyY direction:
386
+
387
+ **Single-Stage Models**
388
+
389
+ 1. **Simple MLP** - Basic MLP network, 4 inputs to 3 outputs
390
+ 2. **Multi-Head** - Shared encoder with 3 independent decoder heads (x, y, Y)
391
+ 3. **Multi-Head (Optimized)** - Hyperparameter-optimized variant
392
+ 4. **Multi-MLP** - 3 completely independent MLP branches
393
+ 5. **Multi-MLP (Optimized)** - Hyperparameter-optimized variant (BEST)
394
+
395
+ **Two-Stage Models**
396
+
397
+ 6. **Multi-MLP + Error Predictor** - Base Multi-MLP with unified error correction
398
+ 7. **Multi-MLP + Multi-Error Predictor** - 3 independent error predictors
399
+ 8. **Multi-MLP (Opt) + Multi-Error Predictor (Opt)** - Optimized two-stage
400
+ 9. **Multi-Head + Multi-Error Predictor** - Multi-Head with error correction
401
+
402
+ The **Multi-MLP (Optimized)** architecture achieved the best results with Delta-E 0.48.
403
+
404
+ ### Differentiable Approximator
405
+
406
+ A small MLP (68K parameters) trained to approximate the Munsell to xyY conversion for use in differentiable Delta-E loss:
407
+
408
+ - **Architecture**: 4 -> 128 -> 256 -> 128 -> 3 with LayerNorm + SiLU
409
+ - **Accuracy**: MAE ~0.0006 for x, y, and Y components
410
+ - **Output formats**: PyTorch (.pth), ONNX, and JAX-compatible weights (.npz)
411
+
412
+ This enables differentiable Munsell to xyY conversion, which was previously only possible through non-differentiable lookup tables.
413
+
414
+ ## Shared Infrastructure
415
+
416
+ ### Hyperparameter Optimization
417
+
418
+ Optuna was used for systematic hyperparameter search over:
419
+
420
+ - Learning rate (1e-4 to 1e-3)
421
+ - Batch size (256, 512, 1024)
422
+ - Dropout rate (0.0 to 0.2)
423
+ - Chroma branch width multiplier (1.0 to 2.0)
424
+ - Loss function weights (MSE, Huber)
425
+
426
+ Key finding: **No dropout (0.0)** consistently performed better across all models in both conversion directions, contrary to typical deep learning recommendations for regularization.
427
+
428
+ ### Training Infrastructure
429
+
430
+ - **Optimizer**: AdamW with weight decay
431
+ - **Scheduler**: ReduceLROnPlateau (patience=10, factor=0.5)
432
+ - **Early stopping**: Patience=20 epochs
433
+ - **Checkpointing**: Best model saved based on validation loss
434
+ - **Logging**: MLflow for experiment tracking
435
+
436
+ ### JAX Delta-E Implementation
437
+
438
+ Located in `learning_munsell/losses/jax_delta_e.py`:
439
+
440
+ - Differentiable xyY -> XYZ -> Lab color space conversions
441
+ - Full CIE 2000 Delta-E implementation with gradient support
442
+ - JIT-compiled functions for performance
443
+
444
+ Usage:
445
+
446
+ ```python
447
+ from learning_munsell.losses import delta_E_loss, delta_E_CIE2000
448
+
449
+ # Compute perceptual loss between predicted and target xyY
450
+ loss = delta_E_loss(pred_xyY, target_xyY)
451
+ ```
452
+
453
+ ## Limitations
454
+
455
+ ### BatchNorm Instability on MPS
456
+
457
+ Models using `BatchNorm1d` layers exhibit numerical instability when trained on Apple Silicon GPUs via the MPS backend:
458
+
459
+ 1. **Validation loss spikes** during training
460
+ 2. **Occasional extreme outputs** during inference (e.g., 20M instead of ~0.1)
461
+ 3. **Non-reproducible behavior**
462
+
463
+ **Affected Models**: Large dataset error predictors using BatchNorm.
464
+
465
+ **Workarounds**:
466
+
467
+ 1. Use CPU for training
468
+ 2. Replace BatchNorm with LayerNorm
469
+ 3. Use smaller models (300K samples vs 2M)
470
+ 4. Skip error predictor stage for affected models
471
+
472
+ The recommended production model (`multi_resnet_error_predictor_large.onnx`) was trained on the large dataset and does not exhibit this instability.
473
+
474
+ **References**:
475
+
476
+ - [BatchNorm non-trainable exception](https://github.com/pytorch/pytorch/issues/98602)
477
+ - [ONNX export incorrect on MPS](https://github.com/pytorch/pytorch/issues/83230)
478
+ - [MPS kernel bugs](https://elanapearl.github.io/blog/2025/the-bug-that-taught-me-pytorch/)
learning_munsell/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """Learning Munsell - Machine Learning for Munsell Color Conversions."""
2
+
3
+ from pathlib import Path
4
+
5
+ __all__ = ["PROJECT_ROOT"]
6
+
7
+ PROJECT_ROOT = Path(__file__).parent.parent
learning_munsell/analysis/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Analysis utilities for Munsell color conversion models."""
learning_munsell/analysis/error_analysis.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Analyze error distribution to identify problematic regions in Munsell space.
3
+
4
+ This script:
5
+ 1. Runs the best model on all REAL Munsell colors
6
+ 2. Computes Delta-E for each sample
7
+ 3. Identifies samples with high error (Delta-E > threshold)
8
+ 4. Analyzes patterns: which hue families, value ranges, chroma ranges have issues
9
+ 5. Outputs statistics and visualizations
10
+ """
11
+
12
+ import logging
13
+ from collections import defaultdict
14
+
15
+ import numpy as np
16
+ import onnxruntime as ort
17
+ from colour import XYZ_to_Lab, xyY_to_XYZ
18
+ from colour.difference import delta_E_CIE2000
19
+ from colour.notation.datasets.munsell import MUNSELL_COLOURS_REAL
20
+ from colour.notation.munsell import (
21
+ CCS_ILLUMINANT_MUNSELL,
22
+ munsell_colour_to_munsell_specification,
23
+ munsell_specification_to_xyY,
24
+ )
25
+
26
+ from learning_munsell import PROJECT_ROOT
27
+
28
# Plain message-only logging; all report formatting is done in the strings.
logging.basicConfig(level=logging.INFO, format="%(message)s")
LOGGER = logging.getLogger(__name__)

# Munsell hue code -> hue family abbreviation, used for the per-family
# error breakdown. Code 0 is an alias of 10 ("RP") — presumably the hue
# code wraps around; confirm against the specification encoding.
HUE_NAMES = {
    1: "R",
    2: "YR",
    3: "Y",
    4: "GY",
    5: "G",
    6: "BG",
    7: "B",
    8: "PB",
    9: "P",
    10: "RP",
    0: "RP",
}
44
+
45
+
46
def load_model_and_params(model_name: str):
    """Load the ONNX inference session and its normalization parameters.

    Looks up ``<model_name>.onnx`` and the companion
    ``<model_name>_normalization_params.npz`` archive under
    ``models/from_xyY`` and returns the ONNX session together with the
    input and output normalization dictionaries stored in the archive.
    """
    base_dir = PROJECT_ROOT / "models" / "from_xyY"
    onnx_file = base_dir / f"{model_name}.onnx"
    npz_file = base_dir / f"{model_name}_normalization_params.npz"

    # Fail early with a clear message when either artifact is missing.
    if not onnx_file.exists():
        raise FileNotFoundError(f"Model not found: {onnx_file}")
    if not npz_file.exists():
        raise FileNotFoundError(f"Params not found: {npz_file}")

    runtime_session = ort.InferenceSession(str(onnx_file))
    archive = np.load(npz_file, allow_pickle=True)

    # The archive stores 0-d object arrays; ``.item()`` unwraps the dicts.
    return (
        runtime_session,
        archive["input_params"].item(),
        archive["output_params"].item(),
    )
64
+
65
+
66
def normalize_input(xyY: np.ndarray, params: dict) -> np.ndarray:
    """Min-max normalize an xyY array using the model's stored ranges.

    The Y (luminance) channel is first rescaled from 0-100 to 0-1, then
    every channel is mapped to [0, 1] using the ``*_range`` entries in
    ``params``. Returns a new float32 array; the input is not modified.
    """
    out = np.copy(xyY).astype(np.float32)

    x_lo, x_hi = params["x_range"]
    y_lo, y_hi = params["y_range"]
    Y_lo, Y_hi = params["Y_range"]

    out[..., 0] = (xyY[..., 0] - x_lo) / (x_hi - x_lo)
    out[..., 1] = (xyY[..., 1] - y_lo) / (y_hi - y_lo)
    # Y arrives in the 0-100 range; bring it to 0-1 before min-max scaling.
    out[..., 2] = (xyY[..., 2] / 100.0 - Y_lo) / (Y_hi - Y_lo)
    return out
81
+
82
+
83
def denormalize_output(pred: np.ndarray, params: dict) -> np.ndarray:
    """Map normalized network outputs back to Munsell specification units.

    Each of the four channels (hue, value, chroma, code) is rescaled from
    [0, 1] to its original range as recorded in ``params``. Returns a new
    array; the input is not modified.
    """
    channel_ranges = (
        params["hue_range"],
        params["value_range"],
        params["chroma_range"],
        params["code_range"],
    )
    denorm = np.copy(pred)
    for channel, (lo, hi) in enumerate(channel_ranges):
        denorm[..., channel] = pred[..., channel] * (hi - lo) + lo
    return denorm
103
+
104
+
105
def compute_delta_e(pred_spec: np.ndarray, gt_xyY: np.ndarray) -> float:
    """Return the CIE 2000 Delta-E between a predicted Munsell spec and a
    ground-truth xyY colour.

    The prediction is round-tripped spec -> xyY -> XYZ -> Lab; the ground
    truth goes xyY -> XYZ -> Lab (after rescaling Y from 0-100 to 0-1).
    Any failure in the colour-science round trip yields NaN so the caller
    can simply filter out invalid samples.
    """
    try:
        # Predicted specification -> Lab.
        lab_pred = XYZ_to_Lab(
            xyY_to_XYZ(munsell_specification_to_xyY(pred_spec)),
            CCS_ILLUMINANT_MUNSELL,
        )

        # Ground-truth xyY -> Lab; Y is stored as 0-100 and must be 0-1.
        reference = gt_xyY.copy()
        reference[2] = gt_xyY[2] / 100.0
        lab_ref = XYZ_to_Lab(xyY_to_XYZ(reference), CCS_ILLUMINANT_MUNSELL)

        return delta_E_CIE2000(lab_ref, lab_pred)
    except Exception:
        # Best-effort: signal failure with NaN instead of aborting the sweep.
        return np.nan
121
+
122
+
123
def analyze_errors(model_name: str = "multi_head_large", threshold: float = 3.0):
    """Analyze the Delta-E error distribution of a model on REAL Munsell colors.

    Runs the ONNX model over every entry of ``MUNSELL_COLOURS_REAL``,
    computes the per-sample CIE 2000 Delta-E, then logs aggregate
    statistics broken down by hue family, value range and chroma range,
    the 20 worst samples, and per-component errors for samples whose
    Delta-E exceeds ``threshold``.

    Parameters
    ----------
    model_name
        Base name of the ONNX model under ``models/from_xyY``.
    threshold
        Delta-E above which a sample counts as "high error".

    Returns
    -------
    list[dict]
        One record per successfully evaluated sample (spec, prediction,
        Delta-E and component values).
    """
    LOGGER.info("=" * 80)
    LOGGER.info("Error Analysis for %s", model_name)
    LOGGER.info("=" * 80)

    # Load model
    session, input_params, output_params = load_model_and_params(model_name)
    input_name = session.get_inputs()[0].name

    # Collect data
    results = []

    for munsell_spec_tuple, xyY_gt in MUNSELL_COLOURS_REAL:
        hue_code_str, value, chroma = munsell_spec_tuple
        munsell_str = f"{hue_code_str} {value}/{chroma}"

        try:
            gt_spec = munsell_colour_to_munsell_specification(munsell_str)
            gt_xyY = np.array(xyY_gt)

            # Predict: model expects a normalized (1, 3) xyY batch.
            xyY_norm = normalize_input(gt_xyY.reshape(1, 3), input_params)
            pred_norm = session.run(None, {input_name: xyY_norm})[0]
            pred_spec = denormalize_output(pred_norm, output_params)[0]

            # Clamp to valid ranges. NOTE(review): the hue lower bound is
            # 0.5 here (vs 0.0 elsewhere in the project) — confirm intent.
            pred_spec[0] = np.clip(pred_spec[0], 0.5, 10.0)
            pred_spec[1] = np.clip(pred_spec[1], 1.0, 9.0)
            pred_spec[2] = np.clip(pred_spec[2], 0.0, 50.0)
            pred_spec[3] = np.clip(pred_spec[3], 1.0, 10.0)
            # Hue code is categorical — snap it to the nearest integer.
            pred_spec[3] = np.round(pred_spec[3])

            # Compute Delta-E (NaN when the colour round trip fails).
            delta_e = compute_delta_e(pred_spec, gt_xyY)

            if not np.isnan(delta_e):
                results.append({
                    "munsell_str": munsell_str,
                    "gt_spec": gt_spec,
                    "pred_spec": pred_spec,
                    "delta_e": delta_e,
                    "hue": gt_spec[0],
                    "value": gt_spec[1],
                    "chroma": gt_spec[2],
                    "code": int(gt_spec[3]),
                    "gt_xyY": gt_xyY,
                })
        except Exception as e:
            # Best-effort sweep: log and continue with the next colour.
            LOGGER.warning("Failed for %s: %s", munsell_str, e)

    LOGGER.info("\nTotal samples evaluated: %d", len(results))

    # Overall statistics
    delta_es = [r["delta_e"] for r in results]
    LOGGER.info("\nOverall Delta-E Statistics:")
    LOGGER.info(" Mean: %.4f", np.mean(delta_es))
    LOGGER.info(" Median: %.4f", np.median(delta_es))
    LOGGER.info(" Std: %.4f", np.std(delta_es))
    LOGGER.info(" Min: %.4f", np.min(delta_es))
    LOGGER.info(" Max: %.4f", np.max(delta_es))

    # Distribution: cumulative counts below fixed Delta-E thresholds.
    LOGGER.info("\nDelta-E Distribution:")
    for thresh in [1.0, 2.0, 3.0, 5.0, 10.0]:
        count = sum(1 for d in delta_es if d <= thresh)
        pct = 100 * count / len(delta_es)
        LOGGER.info(" <= %.1f: %4d (%.1f%%)", thresh, count, pct)

    # High error samples (reused for the component-error section below).
    high_error = [r for r in results if r["delta_e"] > threshold]
    LOGGER.info("\nSamples with Delta-E > %.1f: %d (%.1f%%)",
                threshold, len(high_error), 100 * len(high_error) / len(results))

    # Analyze by hue family
    LOGGER.info("\n" + "=" * 40)
    LOGGER.info("Analysis by Hue Family")
    LOGGER.info("=" * 40)

    by_hue = defaultdict(list)
    for r in results:
        # Unknown codes (not in HUE_NAMES) are labelled "?<code>".
        hue_name = HUE_NAMES.get(r["code"], f"?{r['code']}")
        by_hue[hue_name].append(r["delta_e"])

    LOGGER.info("\n%-4s %5s %6s %6s %6s %s",
                "Hue", "Count", "Mean", "Median", "Max", ">3.0")
    for hue_name in ["R", "YR", "Y", "GY", "G", "BG", "B", "PB", "P", "RP"]:
        if hue_name in by_hue:
            des = by_hue[hue_name]
            high = sum(1 for d in des if d > 3.0)
            LOGGER.info("%-4s %5d %6.2f %6.2f %6.2f %d (%.0f%%)",
                        hue_name, len(des), np.mean(des), np.median(des),
                        np.max(des), high, 100*high/len(des))

    # Analyze by value range (half-open [min, max) buckets)
    LOGGER.info("\n" + "=" * 40)
    LOGGER.info("Analysis by Value Range")
    LOGGER.info("=" * 40)

    value_ranges = [(1, 3), (3, 5), (5, 7), (7, 9)]
    LOGGER.info("\n%-8s %5s %6s %6s %6s %s",
                "Value", "Count", "Mean", "Median", "Max", ">3.0")
    for v_min, v_max in value_ranges:
        des = [r["delta_e"] for r in results if v_min <= r["value"] < v_max]
        if des:
            high = sum(1 for d in des if d > 3.0)
            LOGGER.info("[%d-%d) %5d %6.2f %6.2f %6.2f %d (%.0f%%)",
                        v_min, v_max, len(des), np.mean(des), np.median(des),
                        np.max(des), high, 100*high/len(des) if des else 0)

    # Analyze by chroma range (half-open [min, max) buckets)
    LOGGER.info("\n" + "=" * 40)
    LOGGER.info("Analysis by Chroma Range")
    LOGGER.info("=" * 40)

    chroma_ranges = [(0, 4), (4, 8), (8, 12), (12, 20), (20, 50)]
    LOGGER.info("\n%-8s %5s %6s %6s %6s %s",
                "Chroma", "Count", "Mean", "Median", "Max", ">3.0")
    for c_min, c_max in chroma_ranges:
        des = [r["delta_e"] for r in results if c_min <= r["chroma"] < c_max]
        if des:
            high = sum(1 for d in des if d > 3.0)
            LOGGER.info("[%2d-%2d) %5d %6.2f %6.2f %6.2f %d (%.0f%%)",
                        c_min, c_max, len(des), np.mean(des), np.median(des),
                        np.max(des), high, 100*high/len(des) if des else 0)

    # Top 20 worst samples, by descending Delta-E.
    LOGGER.info("\n" + "=" * 40)
    LOGGER.info("Top 20 Worst Samples")
    LOGGER.info("=" * 40)

    worst = sorted(results, key=lambda r: r["delta_e"], reverse=True)[:20]
    LOGGER.info("\n%-15s %6s %-20s %-20s",
                "Munsell", "DeltaE", "GT Spec", "Pred Spec")
    for r in worst:
        gt = f"[{r['gt_spec'][0]:.1f}, {r['gt_spec'][1]:.1f}, {r['gt_spec'][2]:.1f}, {int(r['gt_spec'][3])}]"
        pred = f"[{r['pred_spec'][0]:.1f}, {r['pred_spec'][1]:.1f}, {r['pred_spec'][2]:.1f}, {int(r['pred_spec'][3])}]"
        LOGGER.info("%-15s %6.2f %-20s %-20s",
                    r["munsell_str"], r["delta_e"], gt, pred)

    # Per-component absolute errors restricted to high-error samples, to
    # show which component (hue/value/chroma/code) drives the Delta-E.
    LOGGER.info("\n" + "=" * 40)
    LOGGER.info("Component Errors for High-Error Samples (Delta-E > %.1f)", threshold)
    LOGGER.info("=" * 40)

    if high_error:
        hue_errors = [abs(r["pred_spec"][0] - r["gt_spec"][0]) for r in high_error]
        value_errors = [abs(r["pred_spec"][1] - r["gt_spec"][1]) for r in high_error]
        chroma_errors = [abs(r["pred_spec"][2] - r["gt_spec"][2]) for r in high_error]
        code_errors = [abs(r["pred_spec"][3] - r["gt_spec"][3]) for r in high_error]

        LOGGER.info("\n%-10s %6s %6s %6s",
                    "Component", "Mean", "Median", "Max")
        LOGGER.info("%-10s %6.2f %6.2f %6.2f", "Hue",
                    np.mean(hue_errors), np.median(hue_errors), np.max(hue_errors))
        LOGGER.info("%-10s %6.2f %6.2f %6.2f", "Value",
                    np.mean(value_errors), np.median(value_errors), np.max(value_errors))
        LOGGER.info("%-10s %6.2f %6.2f %6.2f", "Chroma",
                    np.mean(chroma_errors), np.median(chroma_errors), np.max(chroma_errors))
        LOGGER.info("%-10s %6.2f %6.2f %6.2f", "Code",
                    np.mean(code_errors), np.median(code_errors), np.max(code_errors))

    return results
286
+
287
+
288
def main():
    """Run the error analysis for each candidate model."""
    # Models to analyse; missing model files are skipped with a warning.
    candidates = [
        "multi_head_large",
    ]

    for name in candidates:
        try:
            analyze_errors(name, threshold=3.0)
        except FileNotFoundError as error:
            LOGGER.warning("Skipping %s: %s", name, error)
        LOGGER.info("\n")


if __name__ == "__main__":
    main()
learning_munsell/comparison/from_xyY/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Comparison scripts for xyY to Munsell conversion models."""
learning_munsell/comparison/from_xyY/compare_all_models.py ADDED
@@ -0,0 +1,1292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Compare all ML models for xyY to Munsell conversion on real Munsell data.
3
+
4
+ Models to compare:
5
+ 1. MLP (Base only)
6
+ 2. MLP + Error Predictor (Two-stage)
7
+ 3. Unified MLP
8
+ 4. MLP + Self-Attention
9
+ 5. MLP + Self-Attention + Error Predictor
10
+ 6. Deep + Wide
11
+ 7. Mixture of Experts
12
+ 8. FT-Transformer
13
+ """
14
+
15
+ import logging
16
+ import time
17
+ import warnings
18
+ from datetime import datetime
19
+ from pathlib import Path
20
+ from typing import Any
21
+
22
+ import numpy as np
23
+ import onnxruntime as ort
24
+ from colour import XYZ_to_Lab, xyY_to_XYZ
25
+ from colour.difference import delta_E_CIE2000
26
+ from colour.notation.datasets.munsell import MUNSELL_COLOURS_REAL
27
+ from colour.notation.munsell import (
28
+ CCS_ILLUMINANT_MUNSELL,
29
+ munsell_colour_to_munsell_specification,
30
+ munsell_specification_to_xyY,
31
+ xyY_to_munsell_specification,
32
+ )
33
+ from numpy.typing import NDArray
34
+
35
+ from learning_munsell import PROJECT_ROOT
36
+ from learning_munsell.utilities.common import (
37
+ benchmark_inference_speed,
38
+ get_model_size_mb,
39
+ )
40
+
41
+ logging.basicConfig(level=logging.INFO, format="%(message)s")
42
+ LOGGER = logging.getLogger(__name__)
43
+
44
+
45
+ def normalize_input(X: NDArray, params: dict[str, Any] | None) -> NDArray:
46
+ """Normalize xyY input.
47
+
48
+ If params is None, xyY is assumed to already be in [0, 1] range (no normalization needed).
49
+ """
50
+ if params is None:
51
+ # xyY is already in [0, 1] range - no normalization needed
52
+ return X.astype(np.float32)
53
+
54
+ X_norm = np.copy(X)
55
+ X_norm[..., 0] = (X[..., 0] - params["x_range"][0]) / (
56
+ params["x_range"][1] - params["x_range"][0]
57
+ )
58
+ X_norm[..., 1] = (X[..., 1] - params["y_range"][0]) / (
59
+ params["y_range"][1] - params["y_range"][0]
60
+ )
61
+ X_norm[..., 2] = (X[..., 2] - params["Y_range"][0]) / (
62
+ params["Y_range"][1] - params["Y_range"][0]
63
+ )
64
+ return X_norm.astype(np.float32)
65
+
66
+
67
def denormalize_output(y_norm: NDArray, params: dict[str, Any]) -> NDArray:
    """Rescale normalized Munsell outputs back to specification units.

    Channels are (hue, value, chroma, code); each is mapped from [0, 1]
    to the range recorded in ``params``. Returns a new array; the input
    is not modified.
    """
    keys = ("hue_range", "value_range", "chroma_range", "code_range")
    restored = np.copy(y_norm)
    for axis, key in enumerate(keys):
        low, high = params[key]
        restored[..., axis] = y_norm[..., axis] * (high - low) + low
    return restored
87
+
88
+
89
def clamp_munsell_specification(specification: NDArray) -> NDArray:
    """Clamp a Munsell specification to the ranges the colour library accepts.

    Per-channel limits: hue [0, 10], value [1, 9] (colour library
    constraint), chroma [0, 50], code [1, 10]. Returns a new array; the
    input is not modified.
    """
    bounds = ((0.0, 10.0), (1.0, 9.0), (0.0, 50.0), (1.0, 10.0))
    clamped = np.copy(specification)
    for axis, (low, high) in enumerate(bounds):
        clamped[..., axis] = np.clip(specification[..., axis], low, high)
    return clamped
99
+
100
+
101
def evaluate_model(
    session: ort.InferenceSession,
    X_norm: NDArray,
    ground_truth: NDArray,
    params: dict[str, Any],
    input_name: str = "xyY",
    reference_Lab: NDArray | None = None,
) -> dict[str, Any]:
    """Evaluate a single model.

    Parameters
    ----------
    session
        ONNX inference session for the model under test.
    X_norm
        Normalized xyY inputs, shape (N, 3).
    ground_truth
        Ground-truth Munsell specifications, shape (N, 4)
        (hue, value, chroma, code).
    params
        Output normalization ranges used to denormalize predictions.
    input_name
        Name of the ONNX graph input to feed.
    reference_Lab
        Per-sample ground-truth Lab values; when given, a mean CIE 2000
        Delta-E is computed by round-tripping each prediction through
        the colour library, otherwise ``delta_E`` is NaN.

    Returns
    -------
    dict
        Per-component MAEs, per-sample error arrays, and ``delta_E``.
    """
    pred_norm = session.run(None, {input_name: X_norm})[0]
    pred = denormalize_output(pred_norm, params)
    # Per-sample, per-component absolute errors against ground truth.
    errors = np.abs(pred - ground_truth)

    result = {
        "hue_mae": np.mean(errors[:, 0]),
        "value_mae": np.mean(errors[:, 1]),
        "chroma_mae": np.mean(errors[:, 2]),
        "code_mae": np.mean(errors[:, 3]),
        "max_errors": np.max(errors, axis=1),
        "hue_errors": errors[:, 0],
        "value_errors": errors[:, 1],
        "chroma_errors": errors[:, 2],
        "code_errors": errors[:, 3],
    }

    # Compute Delta-E against ground truth
    if reference_Lab is not None:
        delta_E_values = []
        for idx in range(len(pred)):
            try:
                # Convert ML prediction to Lab: Munsell spec → xyY → XYZ → Lab
                ml_spec = clamp_munsell_specification(pred[idx])

                # Round Code to nearest integer before round-trip conversion
                # (the code channel is categorical).
                ml_spec_for_conversion = ml_spec.copy()
                ml_spec_for_conversion[3] = round(ml_spec[3])

                ml_xyy = munsell_specification_to_xyY(ml_spec_for_conversion)
                ml_XYZ = xyY_to_XYZ(ml_xyy)
                ml_Lab = XYZ_to_Lab(ml_XYZ, CCS_ILLUMINANT_MUNSELL)

                delta_E = delta_E_CIE2000(reference_Lab[idx], ml_Lab)
                delta_E_values.append(delta_E)
            except (RuntimeError, ValueError):
                # Skip if conversion fails (e.g. spec outside the gamut the
                # colour library can convert); mean is over valid samples.
                continue

        result["delta_E"] = np.mean(delta_E_values) if delta_E_values else np.nan
    else:
        result["delta_E"] = np.nan

    return result
153
+
154
+
155
def _format_speed_multiplier(multiplier: float) -> str:
    """Format a speed multiplier with precision adapted to its magnitude."""
    if multiplier > 1000:
        return f"{multiplier:.0f}x"
    if multiplier > 100:
        return f"{multiplier:.1f}x"
    return f"{multiplier:.2f}x"


def generate_html_report(
    results: dict[str, dict[str, Any]],
    num_samples: int,
    output_file: Path,
    baseline_inference_time_ms: float | None = None,
) -> None:
    """Generate a self-contained, dark-themed HTML comparison report.

    Parameters
    ----------
    results
        Mapping of model name to its metrics. Each entry must provide
        ``hue_mae``, ``value_mae``, ``chroma_mae``, ``code_mae``,
        ``max_errors`` (per-sample array), ``model_size_mb``,
        ``inference_time_ms`` and ``delta_E`` (may be NaN when the
        round-trip conversion failed for every sample).
    num_samples
        Number of test samples, shown in the report header and notes.
    output_file
        Destination path for the generated HTML file.
    baseline_inference_time_ms
        Per-sample time of the baseline iterative implementation, used to
        compute "N times faster" multipliers. When ``None`` (or zero) the
        multiplier column shows a placeholder instead of raising.
    """
    # Average MAE across the four Munsell components, per model.
    avg_maes = {}
    for model_name, result in results.items():
        avg_maes[model_name] = np.mean(
            [
                result["hue_mae"],
                result["value_mae"],
                result["chroma_mae"],
                result["code_mae"],
            ]
        )

    # Rank models by average MAE (lowest first).
    sorted_models = sorted(avg_maes.items(), key=lambda x: x[1])

    # Precision thresholds for the accuracy table.
    thresholds = [1e-4, 1e-3, 1e-2, 1e-1, 1.0]

    html = f"""<!DOCTYPE html>
<html lang="en" class="dark">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>ML Model Comparison Report - {datetime.now().strftime("%Y-%m-%d %H:%M")}</title>
    <script src="https://cdn.tailwindcss.com"></script>
    <script>
        tailwind.config = {{
            darkMode: 'class',
            theme: {{
                extend: {{
                    colors: {{
                        border: "hsl(240 3.7% 15.9%)",
                        input: "hsl(240 3.7% 15.9%)",
                        ring: "hsl(240 4.9% 83.9%)",
                        background: "hsl(240 10% 3.9%)",
                        foreground: "hsl(0 0% 98%)",
                        primary: {{
                            DEFAULT: "hsl(263 70% 60%)",
                            foreground: "hsl(0 0% 98%)",
                        }},
                        secondary: {{
                            DEFAULT: "hsl(240 3.7% 15.9%)",
                            foreground: "hsl(0 0% 98%)",
                        }},
                        muted: {{
                            DEFAULT: "hsl(240 3.7% 15.9%)",
                            foreground: "hsl(240 5% 64.9%)",
                        }},
                        accent: {{
                            DEFAULT: "hsl(240 3.7% 15.9%)",
                            foreground: "hsl(0 0% 98%)",
                        }},
                        card: {{
                            DEFAULT: "hsl(240 10% 6%)",
                            foreground: "hsl(0 0% 98%)",
                        }},
                    }}
                }}
            }}
        }}
    </script>
    <style>
        .gradient-primary {{
            background: linear-gradient(135deg, hsl(263 70% 50%) 0%, hsl(280 70% 45%) 100%);
        }}
        .bar-fill {{
            background: linear-gradient(90deg, hsl(263 70% 60%) 0%, hsl(280 70% 55%) 100%);
            transition: width 0.5s cubic-bezier(0.4, 0, 0.2, 1);
        }}
    </style>
</head>
<body class="bg-background text-foreground antialiased">
    <div class="max-w-7xl mx-auto p-6 space-y-6">
        <!-- Header -->
        <div class="gradient-primary rounded-lg p-8 shadow-2xl border border-primary/20">
            <h1 class="text-4xl font-bold text-white mb-2">ML Model Comparison Report</h1>
            <div class="text-white/90 space-y-1">
                <p class="text-lg">xyY to Munsell Specification Conversion</p>
                <p class="text-sm">Generated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}</p>
                <p class="text-sm">Test Samples: <span class="font-semibold">{num_samples:,}</span> real Munsell colors</p>
            </div>
        </div>
"""

    # Best models for each headline metric. Delta-E may be NaN for every
    # model, in which case there is no "best Delta-E" model at all.
    delta_E_values = [
        r["delta_E"] for r in results.values() if not np.isnan(r["delta_E"])
    ]

    best_delta_E = (
        min(
            results.items(),
            key=lambda x: x[1]["delta_E"]
            if not np.isnan(x[1]["delta_E"])
            else float("inf"),
        )[0]
        if delta_E_values
        else None
    )
    best_avg = sorted_models[0][0]

    best_size = min(results.items(), key=lambda x: x[1]["model_size_mb"])[0]
    best_speed = min(results.items(), key=lambda x: x[1]["inference_time_ms"])[0]

    # Guard against the all-NaN Delta-E case so the summary card does not
    # index ``results[None]``.
    if best_delta_E is not None:
        best_delta_E_display = f"{results[best_delta_E]['delta_E']:.4f}"
        best_delta_E_name = best_delta_E
    else:
        best_delta_E_display = "—"
        best_delta_E_name = "N/A"

    html += f"""
        <!-- Best Models Summary -->
        <div class="bg-card rounded-lg border border-border p-6 shadow-lg">
            <h2 class="text-2xl font-semibold mb-6 pb-3 border-b border-primary/30">Best Models by Metric</h2>
            <div class="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-4 gap-4">
                <div class="bg-gradient-to-br from-primary/10 to-primary/5 rounded-lg p-5 border border-primary/20">
                    <div class="text-xs font-semibold text-muted-foreground uppercase tracking-wide mb-2">Smallest Size</div>
                    <div class="text-3xl font-bold text-primary mb-3">{results[best_size]["model_size_mb"]:.2f} MB</div>
                    <div class="text-sm text-foreground/80">{best_size}</div>
                </div>
                <div class="bg-gradient-to-br from-primary/10 to-primary/5 rounded-lg p-5 border border-primary/20">
                    <div class="text-xs font-semibold text-muted-foreground uppercase tracking-wide mb-2">Fastest Speed</div>
                    <div class="text-3xl font-bold text-primary mb-3">{results[best_speed]["inference_time_ms"]:.4f} ms</div>
                    <div class="text-sm text-foreground/80">{best_speed}</div>
                </div>
                <div class="bg-gradient-to-br from-primary/10 to-primary/5 rounded-lg p-5 border border-primary/20">
                    <div class="text-xs font-semibold text-muted-foreground uppercase tracking-wide mb-2">Best Delta-E</div>
                    <div class="text-3xl font-bold text-primary mb-3">{best_delta_E_display}</div>
                    <div class="text-sm text-foreground/80">{best_delta_E_name}</div>
                </div>
                <div class="bg-gradient-to-br from-primary/10 to-primary/5 rounded-lg p-5 border border-primary/20">
                    <div class="text-xs font-semibold text-muted-foreground uppercase tracking-wide mb-2">Best Average MAE</div>
                    <div class="text-3xl font-bold text-primary mb-3">{avg_maes[best_avg]:.4f}</div>
                    <div class="text-sm text-foreground/80">{best_avg}</div>
                </div>
            </div>
        </div>
"""

    # Baseline speed (Colour Library iterative). May be absent; treat a
    # missing/zero baseline as "no relative speed available" rather than
    # dividing by/with None.
    baseline_speed = (
        baseline_inference_time_ms
        if baseline_inference_time_ms is not None
        else 0.0
    )

    # Sort by Delta-E for the performance table (best first; NaN last).
    sorted_by_delta_E = sorted(
        results.items(),
        key=lambda x: x[1]["delta_E"]
        if not np.isnan(x[1]["delta_E"])
        else float("inf"),
    )

    # Find the single fastest model (highest multiplier) for highlighting.
    max_speed_multiplier = 0.0
    best_multiplier_model = None
    for model_name, result in results.items():
        speed_ms = result["inference_time_ms"]
        if speed_ms > 0 and baseline_speed > 0:
            speed_multiplier = baseline_speed / speed_ms
            if speed_multiplier > max_speed_multiplier:
                max_speed_multiplier = speed_multiplier
                best_multiplier_model = model_name

    html += """
        <!-- Performance Metrics Table -->
        <div class="bg-card rounded-lg border border-border p-6 shadow-lg">
            <h2 class="text-2xl font-semibold mb-6 pb-3 border-b border-primary/30">Model Performance Metrics</h2>
            <div class="overflow-x-auto">
                <table class="w-full text-sm">
                    <thead>
                        <tr class="border-b border-border">
                            <th class="text-left py-3 px-4 font-semibold text-muted-foreground">Model</th>
                            <th class="text-right py-3 px-4 font-semibold text-muted-foreground">
                                Size (MB)
                                <div class="text-xs font-normal text-muted-foreground/70 mt-1">ONNX files</div>
                            </th>
                            <th class="text-right py-3 px-4 font-semibold text-muted-foreground">
                                Speed (ms/sample)
                                <div class="text-xs font-normal text-muted-foreground/70 mt-1">10 iterations</div>
                            </th>
                            <th class="text-right py-3 px-4 font-semibold text-muted-foreground">
                                vs Baseline
                                <div class="text-xs font-normal text-muted-foreground/70 mt-1">Colour Iterative</div>
                            </th>
                            <th class="text-right py-3 px-4 font-semibold text-muted-foreground">
                                Delta-E
                                <div class="text-xs font-normal text-muted-foreground/70 mt-1">vs Colour Lib</div>
                            </th>
                            <th class="text-right py-3 px-4 font-semibold text-muted-foreground">Average MAE</th>
                        </tr>
                    </thead>
                    <tbody>
"""

    for model_name, result in sorted_by_delta_E:
        size_mb = result["model_size_mb"]
        speed_ms = result["inference_time_ms"]
        avg_mae = avg_maes[model_name]
        delta_E = result["delta_E"]

        # Relative speed: how many times faster than the baseline. Zero
        # when no valid baseline or timing is available.
        speed_multiplier = (
            baseline_speed / speed_ms
            if (speed_ms > 0 and baseline_speed > 0)
            else 0
        )

        size_class = "text-primary font-semibold" if model_name == best_size else ""
        speed_class = "text-primary font-semibold" if model_name == best_speed else ""
        avg_class = "text-primary font-semibold" if model_name == best_avg else ""
        delta_E_class = (
            "text-primary font-semibold" if model_name == best_delta_E else ""
        )

        # Format Delta-E value (em-dash when the round-trip failed).
        delta_E_str = f"{delta_E:.4f}" if not np.isnan(delta_E) else "—"

        if baseline_speed <= 0:
            # No baseline available: no relative speed can be expressed.
            multiplier_class = "text-muted-foreground"
            multiplier_text = "—"
        elif abs(speed_multiplier - 1.0) < 0.01:
            # This row is (effectively) the baseline itself.
            multiplier_class = "text-muted-foreground"
            multiplier_text = "1.0x"
        elif model_name == best_multiplier_model:
            # Fastest model (highest multiplier) — highlight it.
            multiplier_class = "text-primary font-semibold"
            multiplier_text = _format_speed_multiplier(speed_multiplier)
        elif speed_multiplier > 1.0:
            # Faster than baseline but not the fastest.
            multiplier_class = ""
            multiplier_text = _format_speed_multiplier(speed_multiplier)
        else:
            # Slower than baseline.
            multiplier_class = "text-destructive"
            multiplier_text = f"{speed_multiplier:.2f}x"

        html += f"""
                        <tr class="border-b border-border/50 hover:bg-muted/30 transition-colors">
                            <td class="py-3 px-4 font-medium">{model_name}</td>
                            <td class="py-3 px-4 text-right {size_class}">{size_mb:.2f}</td>
                            <td class="py-3 px-4 text-right {speed_class}">{speed_ms:.4f}</td>
                            <td class="py-3 px-4 text-right {multiplier_class}">{multiplier_text}</td>
                            <td class="py-3 px-4 text-right {delta_E_class}">{delta_E_str}</td>
                            <td class="py-3 px-4 text-right {avg_class}">{avg_mae:.4f}</td>
                        </tr>
"""

    # NOTE: the footnote previously said "<1.0x are faster", contradicting
    # the multiplier definition (baseline / model; >1.0x means faster) and
    # the row styling above. It also hard-coded the sample count; use
    # ``num_samples`` so the note stays correct for any dataset size.
    html += f"""
                    </tbody>
                </table>
            </div>
            <div class="mt-6 p-4 bg-muted/30 rounded-md border border-primary/20">
                <div class="text-sm space-y-2">
                    <div><span class="text-primary font-semibold">Note:</span> Speed measured with 10 iterations (3 warmup + 10 benchmark) on {num_samples:,} samples.</div>
                    <div class="text-xs text-muted-foreground">Two-stage models include both base and error predictor. Highlighted values show best in each metric.</div>
                    <div class="text-xs text-muted-foreground">Baseline comparison: Speed multipliers show relative performance vs Colour Library's iterative xyY_to_munsell_specification(). Values &gt;1.0x are faster.</div>
                </div>
            </div>
        </div>
"""

    # Overall ranking by Delta-E.
    html += """
        <!-- Overall Ranking -->
        <div class="bg-card rounded-lg border border-border p-6 shadow-lg">
            <h2 class="text-2xl font-semibold mb-4 pb-2 border-b border-primary/30">Overall Ranking (by Delta-E)</h2>
            <div class="space-y-1">
"""

    # Sort by Delta-E (best = lowest); models with NaN are excluded.
    sorted_by_delta_E_ranking = sorted(
        [
            (name, res["delta_E"])
            for name, res in results.items()
            if not np.isnan(res["delta_E"])
        ],
        key=lambda x: x[1],
    )

    max_delta_E = (
        max(delta_E for _, delta_E in sorted_by_delta_E_ranking)
        if sorted_by_delta_E_ranking
        else 1.0
    )
    for rank, (model_name, delta_E) in enumerate(sorted_by_delta_E_ranking, 1):
        # Bar width proportional to Delta-E (worst model fills the bar).
        width_pct = (delta_E / max_delta_E) * 100
        html += f"""
            <div class="flex items-center gap-3 p-2 rounded-md hover:bg-muted/50 transition-colors">
                <div class="flex-none w-80 text-sm font-medium">
                    <span class="text-muted-foreground">{rank}.</span> {model_name}
                </div>
                <div class="flex-1 h-6 bg-muted rounded-md overflow-hidden">
                    <div class="bar-fill h-full rounded-md" style="width: {width_pct}%"></div>
                </div>
                <div class="flex-none w-20 text-right font-bold text-primary">{delta_E:.4f}</div>
            </div>
"""

    html += """
            </div>
        </div>
"""

    # Precision Threshold Table.
    html += """
        <div class="bg-card rounded-lg border border-border p-6 shadow-lg">
            <h2 class="text-2xl font-semibold mb-3 pb-3 border-b border-primary/30">Accuracy at Precision Thresholds</h2>
            <p class="text-sm text-muted-foreground mb-6">Percentage of predictions where max error across all components is below threshold:</p>
            <div class="overflow-x-auto">
                <table class="w-full text-sm">
                    <thead>
                        <tr class="border-b border-border">
                            <th class="text-left py-3 px-4 font-semibold text-muted-foreground">Model</th>
"""

    for threshold in thresholds:
        html += f'                            <th class="text-right py-3 px-4 font-semibold text-muted-foreground">&lt; {threshold:.0e}</th>\n'

    html += """
                        </tr>
                    </thead>
                    <tbody>
"""

    # Best/worst accuracy for each threshold column, used to decide
    # whether highlighting is meaningful for that column.
    best_accuracies = {}
    min_accuracies = {}
    for threshold in thresholds:
        accuracies = [
            np.mean(results[model_name]["max_errors"] < threshold) * 100
            for model_name, _ in sorted_models
        ]
        best_accuracies[threshold] = max(accuracies)
        min_accuracies[threshold] = min(accuracies)

    for model_name, _ in sorted_models:
        result = results[model_name]
        row_class = (
            "bg-primary/10 border-l-2 border-l-primary"
            if model_name == best_avg
            else ""
        )
        html += f"""
                        <tr class="border-b border-border hover:bg-muted/30 transition-colors {row_class}">
                            <td class="text-left py-3 px-4 font-medium">{model_name}</td>
"""
        for threshold in thresholds:
            accuracy_pct = np.mean(result["max_errors"] < threshold) * 100
            # Only highlight if there's meaningful variation
            # (>0.1% difference between best and worst).
            has_variation = (
                best_accuracies[threshold] - min_accuracies[threshold]
            ) > 0.1
            is_best = abs(accuracy_pct - best_accuracies[threshold]) < 0.01
            cell_class = (
                "text-right py-3 px-4 font-bold text-primary"
                if (has_variation and is_best)
                else "text-right py-3 px-4"
            )
            html += f'                            <td class="{cell_class}">{accuracy_pct:.2f}%</td>\n'

        html += """
                        </tr>
"""

    html += """
                    </tbody>
                </table>
            </div>
        </div>

    </div>
</body>
</html>
"""

    # Write HTML file.
    with open(output_file, "w") as f:
        f.write(html)

    LOGGER.info("")
    LOGGER.info("HTML report saved to: %s", output_file)
546
+
547
+ def main() -> None:
548
+ """Compare all models."""
549
+ LOGGER.info("=" * 80)
550
+ LOGGER.info("Comprehensive Model Comparison")
551
+ LOGGER.info("=" * 80)
552
+
553
+ # Paths
554
+ model_directory = PROJECT_ROOT / "models" / "from_xyY"
555
+
556
+ # Load real Munsell dataset
557
+ LOGGER.info("")
558
+ LOGGER.info("Loading real Munsell dataset...")
559
+ xyY_samples = []
560
+ ground_truth = []
561
+
562
+ for munsell_spec_tuple, xyY in MUNSELL_COLOURS_REAL:
563
+ try:
564
+ hue_code, value, chroma = munsell_spec_tuple
565
+ munsell_str = f"{hue_code} {value}/{chroma}"
566
+ spec = munsell_colour_to_munsell_specification(munsell_str)
567
+ xyY_scaled = np.array([xyY[0], xyY[1], xyY[2] / 100.0])
568
+ xyY_samples.append(xyY_scaled)
569
+ ground_truth.append(spec)
570
+ except Exception: # noqa: BLE001, S112
571
+ continue
572
+
573
+ xyY_samples = np.array(xyY_samples)
574
+ ground_truth = np.array(ground_truth)
575
+ LOGGER.info("Loaded %d valid Munsell colors", len(xyY_samples))
576
+
577
+ # Define models to compare
578
+ models = [
579
+ {
580
+ "name": "MLP (Base Only)",
581
+ "files": [model_directory / "mlp.onnx"],
582
+ "params_file": model_directory / "mlp_normalization_params.npz",
583
+ "type": "single",
584
+ },
585
+ {
586
+ "name": "MLP + Error Predictor",
587
+ "files": [
588
+ model_directory / "mlp.onnx",
589
+ model_directory / "mlp_error_predictor.onnx",
590
+ ],
591
+ "params_file": model_directory / "mlp_normalization_params.npz",
592
+ "type": "two_stage",
593
+ },
594
+ {
595
+ "name": "Unified MLP",
596
+ "files": [model_directory / "unified_mlp.onnx"],
597
+ "params_file": model_directory / "unified_mlp_normalization_params.npz",
598
+ "type": "single",
599
+ },
600
+ {
601
+ "name": "MLP + Self-Attention",
602
+ "files": [model_directory / "mlp_attention.onnx"],
603
+ "params_file": model_directory
604
+ / "mlp_attention_normalization_params.npz",
605
+ "type": "single",
606
+ },
607
+ {
608
+ "name": "MLP + Self-Attention + Error Predictor",
609
+ "files": [
610
+ model_directory / "mlp_attention.onnx",
611
+ model_directory / "mlp_attention_error_predictor.onnx",
612
+ ],
613
+ "params_file": model_directory
614
+ / "mlp_attention_normalization_params.npz",
615
+ "type": "two_stage",
616
+ },
617
+ {
618
+ "name": "Deep + Wide",
619
+ "files": [model_directory / "deep_wide.onnx"],
620
+ "params_file": model_directory / "deep_wide_normalization_params.npz",
621
+ "type": "single",
622
+ },
623
+ {
624
+ "name": "Mixture of Experts",
625
+ "files": [model_directory / "mixture_of_experts.onnx"],
626
+ "params_file": model_directory
627
+ / "mixture_of_experts_normalization_params.npz",
628
+ "type": "single",
629
+ },
630
+ {
631
+ "name": "FT-Transformer",
632
+ "files": [model_directory / "ft_transformer.onnx"],
633
+ "params_file": model_directory / "ft_transformer_normalization_params.npz",
634
+ "type": "single",
635
+ },
636
+ {
637
+ "name": "Multi-Head",
638
+ "files": [model_directory / "multi_head.onnx"],
639
+ "params_file": model_directory / "multi_head_normalization_params.npz",
640
+ "type": "single",
641
+ },
642
+ {
643
+ "name": "Multi-Head (Optimized)",
644
+ "files": [model_directory / "multi_head_optimized.onnx"],
645
+ "params_file": model_directory / "multi_head_optimized_normalization_params.npz",
646
+ "type": "single",
647
+ },
648
+ {
649
+ "name": "Multi-Head + Error Predictor",
650
+ "files": [
651
+ model_directory / "multi_head.onnx",
652
+ model_directory / "multi_head_error_predictor.onnx",
653
+ ],
654
+ "params_file": model_directory / "multi_head_normalization_params.npz",
655
+ "type": "two_stage",
656
+ },
657
+ {
658
+ "name": "Multi-MLP",
659
+ "files": [model_directory / "multi_mlp.onnx"],
660
+ "params_file": model_directory / "multi_mlp_normalization_params.npz",
661
+ "type": "single",
662
+ },
663
+ {
664
+ "name": "Multi-MLP + Error Predictor",
665
+ "files": [
666
+ model_directory / "multi_mlp.onnx",
667
+ model_directory / "multi_mlp_error_predictor.onnx",
668
+ ],
669
+ "params_file": model_directory / "multi_mlp_normalization_params.npz",
670
+ "type": "two_stage",
671
+ },
672
+ {
673
+ "name": "Multi-MLP + Multi-Error Predictor",
674
+ "files": [
675
+ model_directory / "multi_mlp.onnx",
676
+ model_directory / "multi_mlp_multi_error_predictor.onnx",
677
+ ],
678
+ "params_file": model_directory / "multi_mlp_normalization_params.npz",
679
+ "type": "two_stage",
680
+ },
681
+ {
682
+ "name": "Multi-MLP + Multi-Error Predictor (Optimized)",
683
+ "files": [
684
+ model_directory / "multi_mlp.onnx",
685
+ model_directory / "multi_mlp_multi_error_predictor_optimized.onnx",
686
+ ],
687
+ "params_file": model_directory / "multi_mlp_normalization_params.npz",
688
+ "type": "two_stage",
689
+ },
690
+ {
691
+ "name": "Multi-MLP (Optimized)",
692
+ "files": [model_directory / "multi_mlp_optimized.onnx"],
693
+ "params_file": model_directory / "multi_mlp_optimized_normalization_params.npz",
694
+ "type": "single",
695
+ },
696
+ {
697
+ "name": "Multi-Head + Multi-Error Predictor",
698
+ "files": [
699
+ model_directory / "multi_head.onnx",
700
+ model_directory / "multi_head_multi_error_predictor.onnx",
701
+ ],
702
+ "params_file": model_directory / "multi_head_normalization_params.npz",
703
+ "type": "two_stage",
704
+ },
705
+ {
706
+ "name": "Multi-Head + Cross-Attention Error Predictor",
707
+ "files": [
708
+ model_directory / "multi_head.onnx",
709
+ model_directory / "multi_head_cross_attention_error_predictor.onnx",
710
+ ],
711
+ "params_file": model_directory / "multi_head_normalization_params.npz",
712
+ "type": "two_stage",
713
+ },
714
+ {
715
+ "name": "Multi-Head (Optimized) + Multi-Error Predictor (Optimized)",
716
+ "files": [
717
+ model_directory / "multi_head_optimized.onnx",
718
+ model_directory / "multi_head_error_predictor_optimized.onnx",
719
+ ],
720
+ "params_file": model_directory / "multi_head_optimized_normalization_params.npz",
721
+ "type": "two_stage",
722
+ },
723
+ {
724
+ "name": "Multi-Head (Circular Loss)",
725
+ "files": [model_directory / "multi_head_circular.onnx"],
726
+ "params_file": model_directory / "multi_head_circular_normalization_params.npz",
727
+ "type": "single",
728
+ },
729
+ {
730
+ "name": "Multi-Head (Large Dataset)",
731
+ "files": [model_directory / "multi_head_large.onnx"],
732
+ "params_file": model_directory / "multi_head_large_normalization_params.npz",
733
+ "type": "single",
734
+ },
735
+ {
736
+ "name": "Multi-Head + Multi-Error Predictor (Large Dataset)",
737
+ "files": [
738
+ model_directory / "multi_head_large.onnx",
739
+ model_directory / "multi_head_multi_error_predictor_large.onnx",
740
+ ],
741
+ "params_file": model_directory / "multi_head_large_normalization_params.npz",
742
+ "type": "two_stage",
743
+ },
744
+ {
745
+ "name": "Multi-MLP (Large Dataset)",
746
+ "files": [model_directory / "multi_mlp_large.onnx"],
747
+ "params_file": model_directory / "multi_mlp_large_normalization_params.npz",
748
+ "type": "single",
749
+ },
750
+ {
751
+ "name": "Multi-MLP + Multi-Error Predictor (Large Dataset)",
752
+ "files": [
753
+ model_directory / "multi_mlp_large.onnx",
754
+ model_directory / "multi_mlp_multi_error_predictor_large.onnx",
755
+ ],
756
+ "params_file": model_directory / "multi_mlp_large_normalization_params.npz",
757
+ "type": "two_stage",
758
+ },
759
+ {
760
+ "name": "Transformer (Large Dataset)",
761
+ "files": [model_directory / "transformer_large.onnx"],
762
+ "params_file": model_directory / "transformer_large_normalization_params.npz",
763
+ "type": "single",
764
+ },
765
+ {
766
+ "name": "Transformer + Error Predictor (Large Dataset)",
767
+ "files": [
768
+ model_directory / "transformer_large.onnx",
769
+ model_directory / "transformer_multi_error_predictor_large.onnx",
770
+ ],
771
+ "params_file": model_directory / "transformer_large_normalization_params.npz",
772
+ "type": "two_stage",
773
+ },
774
+ {
775
+ "name": "Multi-Head Refined (REAL Only)",
776
+ "files": [model_directory / "multi_head_refined_real.onnx"],
777
+ "params_file": model_directory / "multi_head_refined_real_normalization_params.npz",
778
+ "type": "single",
779
+ },
780
+ {
781
+ "name": "Multi-Head Refined + Error Predictor (REAL Only)",
782
+ "files": [
783
+ model_directory / "multi_head_refined_real.onnx",
784
+ model_directory / "multi_head_multi_error_predictor_refined_real.onnx",
785
+ ],
786
+ "params_file": model_directory / "multi_head_refined_real_normalization_params.npz",
787
+ "type": "two_stage",
788
+ },
789
+ {
790
+ "name": "Multi-Head + Multi-Error Predictor + Multi-Error Predictor (3-Stage)",
791
+ "files": [
792
+ model_directory / "multi_head_large.onnx",
793
+ model_directory / "multi_head_multi_error_predictor_large.onnx",
794
+ model_directory / "multi_head_3stage_error_predictor.onnx",
795
+ ],
796
+ "params_file": model_directory / "multi_head_large_normalization_params.npz",
797
+ "type": "three_stage",
798
+ },
799
+ {
800
+ "name": "Multi-Head (Weighted + Boundary Loss)",
801
+ "files": [model_directory / "multi_head_weighted_boundary.onnx"],
802
+ "params_file": model_directory / "multi_head_weighted_boundary_normalization_params.npz",
803
+ "type": "single",
804
+ },
805
+ {
806
+ "name": "Multi-Head (Weighted + Boundary Loss) + Multi-Error Predictor",
807
+ "files": [
808
+ model_directory / "multi_head_weighted_boundary.onnx",
809
+ model_directory / "multi_head_weighted_boundary_multi_error_predictor.onnx",
810
+ ],
811
+ "params_file": model_directory / "multi_head_weighted_boundary_normalization_params.npz",
812
+ "type": "two_stage",
813
+ },
814
+ {
815
+ "name": "Multi-Head (Weighted + Boundary Loss) + Multi-Error Predictor (Weighted + Boundary Loss)",
816
+ "files": [
817
+ model_directory / "multi_head_weighted_boundary.onnx",
818
+ model_directory / "multi_head_weighted_boundary_multi_error_predictor_weighted_boundary.onnx",
819
+ ],
820
+ "params_file": model_directory / "multi_head_weighted_boundary_normalization_params.npz",
821
+ "type": "two_stage",
822
+ },
823
+ {
824
+ "name": "Multi-MLP (Weighted + Boundary Loss) (Large Dataset)",
825
+ "files": [model_directory / "multi_mlp_weighted_boundary.onnx"],
826
+ "params_file": model_directory / "multi_mlp_weighted_boundary_normalization_params.npz",
827
+ "type": "single",
828
+ },
829
+ {
830
+ "name": "Multi-MLP (Weighted + Boundary Loss) + Multi-Error Predictor (Weighted + Boundary Loss) (Large Dataset)",
831
+ "files": [
832
+ model_directory / "multi_mlp_weighted_boundary.onnx",
833
+ model_directory / "multi_mlp_weighted_boundary_multi_error_predictor.onnx",
834
+ ],
835
+ "params_file": model_directory / "multi_mlp_weighted_boundary_normalization_params.npz",
836
+ "type": "two_stage",
837
+ },
838
+ {
839
+ "name": "Multi-ResNet (Large Dataset)",
840
+ "files": [model_directory / "multi_resnet_large.onnx"],
841
+ "params_file": model_directory / "multi_resnet_large_normalization_params.npz",
842
+ "type": "single",
843
+ },
844
+ {
845
+ "name": "Multi-ResNet + Multi-Error Predictor (Large Dataset)",
846
+ "files": [
847
+ model_directory / "multi_resnet_large.onnx",
848
+ model_directory / "multi_resnet_error_predictor_large.onnx",
849
+ ],
850
+ "params_file": model_directory / "multi_resnet_large_normalization_params.npz",
851
+ "type": "two_stage",
852
+ },
853
+ ]
854
+
855
+ # Benchmark colour library's iterative implementation first
856
+ LOGGER.info("")
857
+ LOGGER.info("=" * 80)
858
+ LOGGER.info("Colour Library (Iterative)")
859
+ LOGGER.info("=" * 80)
860
+
861
+ # Benchmark the iterative xyY_to_munsell_specification function
862
+ # Note: Using full dataset (100% of samples)
863
+
864
+ # Set random seed for reproducibility
865
+ np.random.seed(42)
866
+
867
+ # Use 100% of samples for comprehensive benchmarking
868
+ sample_count = len(xyY_samples)
869
+ sampled_indices = np.arange(len(xyY_samples))
870
+ xyY_benchmark_samples = xyY_samples[sampled_indices]
871
+
872
+ # Measure inference time on sampled Munsell colors
873
+ start_time = time.perf_counter()
874
+ convergence_failures = 0
875
+ successful_inferences = 0
876
+
877
+ with warnings.catch_warnings():
878
+ warnings.simplefilter("ignore")
879
+ for xyy in xyY_benchmark_samples:
880
+ try:
881
+ xyY_to_munsell_specification(xyy)
882
+ successful_inferences += 1
883
+ except (RuntimeError, ValueError):
884
+ # Out-of-gamut color that doesn't converge or not in renotation system
885
+ convergence_failures += 1
886
+
887
+ end_time = time.perf_counter()
888
+
889
+ # Calculate average time per successful inference (in milliseconds)
890
+ total_time_s = end_time - start_time
891
+ colour_inference_time_ms = (
892
+ (total_time_s / successful_inferences) * 1000
893
+ if successful_inferences > 0
894
+ else 0
895
+ )
896
+
897
+ LOGGER.info("")
898
+ LOGGER.info("Performance Metrics:")
899
+ LOGGER.info(" Successful inferences: %d", successful_inferences)
900
+ LOGGER.info(" Convergence failures: %d", convergence_failures)
901
+ LOGGER.info(" Inference Speed: %.4f ms/sample", colour_inference_time_ms)
902
+ LOGGER.info(" Note: This is the baseline iterative implementation")
903
+
904
+ # Store the baseline speed
905
+ baseline_inference_time_ms = colour_inference_time_ms
906
+
907
+ # Convert ground truth Munsell specs to CIE Lab for Delta-E comparison
908
+ # Path: Munsell spec → xyY → XYZ → Lab
909
+ LOGGER.info("")
910
+ LOGGER.info(
911
+ "Converting ground truth to CIE Lab for Delta-E comparison..."
912
+ )
913
+ LOGGER.info(" Path: Munsell spec \u2192 xyY \u2192 XYZ \u2192 Lab")
914
+ reference_Lab = []
915
+ for spec in ground_truth:
916
+ try:
917
+ # Munsell specification → xyY
918
+ xyy = munsell_specification_to_xyY(spec)
919
+ # xyY → XYZ
920
+ XYZ = xyY_to_XYZ(xyy)
921
+ # XYZ → Lab (Illuminant C for Munsell)
922
+ Lab = XYZ_to_Lab(XYZ, CCS_ILLUMINANT_MUNSELL)
923
+ reference_Lab.append(Lab)
924
+ except (RuntimeError, ValueError):
925
+ # If conversion fails, use NaN
926
+ reference_Lab.append(np.array([np.nan, np.nan, np.nan]))
927
+
928
+ reference_Lab = np.array(reference_Lab)
929
+ LOGGER.info(
930
+ " Converted %d ground truth specs to CIE Lab",
931
+ len(reference_Lab),
932
+ )
933
+
934
+ # Use the same sampled subset for ML model evaluations (for fair comparison)
935
+ xyY_samples = xyY_benchmark_samples
936
+ ground_truth = ground_truth[sampled_indices]
937
+
938
+ # Evaluate each model
939
+ results = {}
940
+
941
+ for model_info in models:
942
+ model_name = model_info["name"]
943
+ LOGGER.info("")
944
+ LOGGER.info("=" * 80)
945
+ LOGGER.info(model_name)
946
+ LOGGER.info("=" * 80)
947
+
948
+ # Load normalization params for this model
949
+ params = np.load(model_info["params_file"], allow_pickle=True)
950
+ # input_params may not exist if xyY is already in [0, 1] range
951
+ input_params = (
952
+ params["input_params"].item()
953
+ if "input_params" in params.files
954
+ else None
955
+ )
956
+ output_params = params["output_params"].item()
957
+
958
+ # Normalize input with this model's params (None means no normalization)
959
+ X_norm = normalize_input(xyY_samples, input_params)
960
+
961
+ # Calculate model size
962
+ model_size_mb = get_model_size_mb(model_info["files"])
963
+
964
+ if model_info["type"] == "two_stage":
965
+ # Two-stage model
966
+ base_session = ort.InferenceSession(str(model_info["files"][0]))
967
+ error_session = ort.InferenceSession(str(model_info["files"][1]))
968
+
969
+ # Define inference callable for benchmarking
970
+ def two_stage_inference(
971
+ _base_session: ort.InferenceSession = base_session,
972
+ _error_session: ort.InferenceSession = error_session,
973
+ _X_norm: NDArray = X_norm,
974
+ ) -> NDArray:
975
+ base_pred = _base_session.run(None, {"xyY": _X_norm})[0]
976
+ combined = np.concatenate([_X_norm, base_pred], axis=1).astype(
977
+ np.float32
978
+ )
979
+ error_corr = _error_session.run(None, {"combined_input": combined})[
980
+ 0
981
+ ]
982
+ return base_pred + error_corr
983
+
984
+ # Benchmark speed
985
+ inference_time_ms = benchmark_inference_speed(
986
+ two_stage_inference, X_norm
987
+ )
988
+
989
+ # Get predictions
990
+ base_pred_norm = base_session.run(None, {"xyY": X_norm})[0]
991
+ combined_input = np.concatenate(
992
+ [X_norm, base_pred_norm], axis=1
993
+ ).astype(np.float32)
994
+ error_correction_norm = error_session.run(
995
+ None, {"combined_input": combined_input}
996
+ )[0]
997
+ final_pred_norm = base_pred_norm + error_correction_norm
998
+ pred = denormalize_output(final_pred_norm, output_params)
999
+ errors = np.abs(pred - ground_truth)
1000
+
1001
+ result = {
1002
+ "hue_mae": np.mean(errors[:, 0]),
1003
+ "value_mae": np.mean(errors[:, 1]),
1004
+ "chroma_mae": np.mean(errors[:, 2]),
1005
+ "code_mae": np.mean(errors[:, 3]),
1006
+ "max_errors": np.max(errors, axis=1),
1007
+ "hue_errors": errors[:, 0],
1008
+ "value_errors": errors[:, 1],
1009
+ "chroma_errors": errors[:, 2],
1010
+ "code_errors": errors[:, 3],
1011
+ "model_size_mb": model_size_mb,
1012
+ "inference_time_ms": inference_time_ms,
1013
+ }
1014
+
1015
+ # Compute Delta-E against ground truth
1016
+ delta_E_values = []
1017
+ for idx in range(len(pred)):
1018
+ try:
1019
+ # Convert ML prediction to Lab: Munsell spec → xyY → XYZ → Lab
1020
+ ml_spec = clamp_munsell_specification(pred[idx])
1021
+
1022
+ # Round Code to nearest integer before round-trip conversion
1023
+ ml_spec_for_conversion = ml_spec.copy()
1024
+ ml_spec_for_conversion[3] = round(ml_spec[3])
1025
+
1026
+ ml_xyy = munsell_specification_to_xyY(ml_spec_for_conversion)
1027
+ ml_XYZ = xyY_to_XYZ(ml_xyy)
1028
+ ml_Lab = XYZ_to_Lab(ml_XYZ, CCS_ILLUMINANT_MUNSELL)
1029
+
1030
+ # Get ground truth Lab
1031
+ reference_Lab_sample = reference_Lab[idx]
1032
+
1033
+ # Compute Delta-E CIE2000
1034
+ delta_E = delta_E_CIE2000(reference_Lab_sample, ml_Lab)
1035
+ delta_E_values.append(delta_E)
1036
+ except (RuntimeError, ValueError):
1037
+ # Skip if conversion fails
1038
+ continue
1039
+
1040
+ result["delta_E"] = (
1041
+ np.mean(delta_E_values) if delta_E_values else np.nan
1042
+ )
1043
+ elif model_info["type"] == "three_stage":
1044
+ # Three-stage model: base + error predictor 1 + error predictor 2
1045
+ base_session = ort.InferenceSession(str(model_info["files"][0]))
1046
+ error1_session = ort.InferenceSession(str(model_info["files"][1]))
1047
+ error2_session = ort.InferenceSession(str(model_info["files"][2]))
1048
+
1049
+ # Define inference callable for benchmarking
1050
+ def three_stage_inference(
1051
+ _base_session: ort.InferenceSession = base_session,
1052
+ _error1_session: ort.InferenceSession = error1_session,
1053
+ _error2_session: ort.InferenceSession = error2_session,
1054
+ _X_norm: NDArray = X_norm,
1055
+ ) -> NDArray:
1056
+ # Stage 1: Base model
1057
+ base_pred = _base_session.run(None, {"xyY": _X_norm})[0]
1058
+ # Stage 2: First error correction
1059
+ combined1 = np.concatenate([_X_norm, base_pred], axis=1).astype(
1060
+ np.float32
1061
+ )
1062
+ error1_corr = _error1_session.run(
1063
+ None, {"combined_input": combined1}
1064
+ )[0]
1065
+ stage2_pred = base_pred + error1_corr
1066
+ # Stage 3: Second error correction
1067
+ combined2 = np.concatenate([_X_norm, stage2_pred], axis=1).astype(
1068
+ np.float32
1069
+ )
1070
+ error2_corr = _error2_session.run(
1071
+ None, {"combined_input": combined2}
1072
+ )[0]
1073
+ return stage2_pred + error2_corr
1074
+
1075
+ # Benchmark speed
1076
+ inference_time_ms = benchmark_inference_speed(
1077
+ three_stage_inference, X_norm
1078
+ )
1079
+
1080
+ # Get predictions
1081
+ base_pred_norm = base_session.run(None, {"xyY": X_norm})[0]
1082
+ combined1 = np.concatenate([X_norm, base_pred_norm], axis=1).astype(
1083
+ np.float32
1084
+ )
1085
+ error1_corr_norm = error1_session.run(
1086
+ None, {"combined_input": combined1}
1087
+ )[0]
1088
+ stage2_pred_norm = base_pred_norm + error1_corr_norm
1089
+ combined2 = np.concatenate([X_norm, stage2_pred_norm], axis=1).astype(
1090
+ np.float32
1091
+ )
1092
+ error2_corr_norm = error2_session.run(
1093
+ None, {"combined_input": combined2}
1094
+ )[0]
1095
+ final_pred_norm = stage2_pred_norm + error2_corr_norm
1096
+ pred = denormalize_output(final_pred_norm, output_params)
1097
+ errors = np.abs(pred - ground_truth)
1098
+
1099
+ result = {
1100
+ "hue_mae": np.mean(errors[:, 0]),
1101
+ "value_mae": np.mean(errors[:, 1]),
1102
+ "chroma_mae": np.mean(errors[:, 2]),
1103
+ "code_mae": np.mean(errors[:, 3]),
1104
+ "max_errors": np.max(errors, axis=1),
1105
+ "hue_errors": errors[:, 0],
1106
+ "value_errors": errors[:, 1],
1107
+ "chroma_errors": errors[:, 2],
1108
+ "code_errors": errors[:, 3],
1109
+ "model_size_mb": model_size_mb,
1110
+ "inference_time_ms": inference_time_ms,
1111
+ }
1112
+
1113
+ # Compute Delta-E against ground truth for three-stage model
1114
+ delta_E_values = []
1115
+ for idx in range(len(pred)):
1116
+ try:
1117
+ ml_spec = clamp_munsell_specification(pred[idx])
1118
+ ml_spec_for_conversion = ml_spec.copy()
1119
+ ml_spec_for_conversion[3] = round(ml_spec[3])
1120
+ ml_xyy = munsell_specification_to_xyY(ml_spec_for_conversion)
1121
+ ml_XYZ = xyY_to_XYZ(ml_xyy)
1122
+ ml_Lab = XYZ_to_Lab(ml_XYZ, CCS_ILLUMINANT_MUNSELL)
1123
+ delta_E = delta_E_CIE2000(reference_Lab[idx], ml_Lab)
1124
+ delta_E_values.append(delta_E)
1125
+ except (RuntimeError, ValueError):
1126
+ continue
1127
+
1128
+ result["delta_E"] = (
1129
+ np.mean(delta_E_values) if delta_E_values else np.nan
1130
+ )
1131
+ else:
1132
+ # Single model
1133
+ session = ort.InferenceSession(str(model_info["files"][0]))
1134
+
1135
+ # Define inference callable for benchmarking
1136
+ def single_inference(
1137
+ _session: ort.InferenceSession = session, _X_norm: NDArray = X_norm
1138
+ ) -> NDArray:
1139
+ return _session.run(None, {"xyY": _X_norm})[0]
1140
+
1141
+ # Benchmark speed
1142
+ inference_time_ms = benchmark_inference_speed(single_inference, X_norm)
1143
+
1144
+ result = evaluate_model(
1145
+ session,
1146
+ X_norm,
1147
+ ground_truth,
1148
+ output_params,
1149
+ reference_Lab=reference_Lab,
1150
+ )
1151
+ result["model_size_mb"] = model_size_mb
1152
+ result["inference_time_ms"] = inference_time_ms
1153
+
1154
+ results[model_name] = result
1155
+
1156
+ # Print results
1157
+ LOGGER.info("")
1158
+ LOGGER.info("Mean Absolute Errors:")
1159
+ LOGGER.info(" Hue: %.4f", result["hue_mae"])
1160
+ LOGGER.info(" Value: %.4f", result["value_mae"])
1161
+ LOGGER.info(" Chroma: %.4f", result["chroma_mae"])
1162
+ LOGGER.info(" Code: %.4f", result["code_mae"])
1163
+ if not np.isnan(result["delta_E"]):
1164
+ LOGGER.info(" Delta-E (vs Ground Truth): %.4f", result["delta_E"])
1165
+ LOGGER.info("")
1166
+ LOGGER.info("Performance Metrics:")
1167
+ LOGGER.info(" Model Size: %.2f MB", result["model_size_mb"])
1168
+ LOGGER.info(
1169
+ " Inference Speed: %.4f ms/sample", result["inference_time_ms"]
1170
+ )
1171
+
1172
+
1173
+ # Summary comparison
1174
+ LOGGER.info("")
1175
+ LOGGER.info("=" * 80)
1176
+ LOGGER.info("SUMMARY COMPARISON")
1177
+ LOGGER.info("=" * 80)
1178
+ LOGGER.info("")
1179
+
1180
+ if not results:
1181
+ LOGGER.info("⚠️ No models were successfully evaluated")
1182
+ return
1183
+
1184
+ # MAE comparison table
1185
+ LOGGER.info("Mean Absolute Error Comparison:")
1186
+ LOGGER.info("")
1187
+ header = "{:<35} {:>8} {:>8} {:>8} {:>8} {:>10}".format(
1188
+ "Model",
1189
+ "Hue",
1190
+ "Value",
1191
+ "Chroma",
1192
+ "Code",
1193
+ "Delta-E",
1194
+ )
1195
+ LOGGER.info(header)
1196
+ LOGGER.info("-" * 90)
1197
+
1198
+ for model_name, result in results.items():
1199
+ delta_E_str = (
1200
+ f"{result['delta_E']:.4f}" if not np.isnan(result["delta_E"]) else "N/A"
1201
+ )
1202
+ LOGGER.info(
1203
+ "%-35s %8.4f %8.4f %8.4f %8.4f %10s",
1204
+ model_name[:35],
1205
+ result["hue_mae"],
1206
+ result["value_mae"],
1207
+ result["chroma_mae"],
1208
+ result["code_mae"],
1209
+ delta_E_str,
1210
+ )
1211
+
1212
+ # Precision threshold comparison
1213
+ LOGGER.info("")
1214
+ LOGGER.info("Accuracy at Precision Thresholds:")
1215
+ LOGGER.info("")
1216
+
1217
+ thresholds = [1e-4, 1e-3, 1e-2, 1e-1, 1.0]
1218
+ header_parts = [f"{'Model/Threshold':<35}"]
1219
+ header_parts.extend(f"{f'< {threshold:.0e}':>10}" for threshold in thresholds)
1220
+ LOGGER.info(" ".join(header_parts))
1221
+ LOGGER.info("-" * 80)
1222
+
1223
+ for model_name, result in results.items():
1224
+ row_parts = [f"{model_name[:35]:<35}"]
1225
+ for threshold in thresholds:
1226
+ accuracy_pct = np.mean(result["max_errors"] < threshold) * 100
1227
+ row_parts.append(f"{accuracy_pct:9.2f}%")
1228
+ LOGGER.info(" ".join(row_parts))
1229
+
1230
+ # Performance metrics comparison
1231
+ LOGGER.info("")
1232
+ LOGGER.info("Model Size and Inference Speed Comparison:")
1233
+ LOGGER.info("")
1234
+ header = f"{'Model':<35} {'Size (MB)':>12} {'Speed (ms/sample)':>18}"
1235
+ LOGGER.info(header)
1236
+ LOGGER.info("-" * 80)
1237
+
1238
+ for model_name, result in results.items():
1239
+ LOGGER.info(
1240
+ "%-35s %11.2f %17.4f",
1241
+ model_name[:35],
1242
+ result["model_size_mb"],
1243
+ result["inference_time_ms"],
1244
+ )
1245
+
1246
+ # Find best model
1247
+ LOGGER.info("")
1248
+ LOGGER.info("=" * 80)
1249
+ LOGGER.info("BEST MODELS BY METRIC")
1250
+ LOGGER.info("=" * 80)
1251
+ LOGGER.info("")
1252
+
1253
+ metrics = ["hue_mae", "value_mae", "chroma_mae", "code_mae"]
1254
+ metric_names = ["Hue MAE", "Value MAE", "Chroma MAE", "Code MAE"]
1255
+
1256
+ for metric, metric_name in zip(metrics, metric_names, strict=False):
1257
+ best_model = min(results.items(), key=lambda x: x[1][metric])
1258
+ LOGGER.info(
1259
+ "%-15s: %s (%.4f)",
1260
+ metric_name,
1261
+ best_model[0],
1262
+ best_model[1][metric],
1263
+ )
1264
+
1265
+ # Overall best (average rank)
1266
+ LOGGER.info("")
1267
+ LOGGER.info("Overall Best (by average component MAE):")
1268
+ for model_name, result in results.items():
1269
+ avg_mae = np.mean(
1270
+ [
1271
+ result["hue_mae"],
1272
+ result["value_mae"],
1273
+ result["chroma_mae"],
1274
+ result["code_mae"],
1275
+ ]
1276
+ )
1277
+ LOGGER.info(" %s: %.4f", model_name, avg_mae)
1278
+
1279
+ LOGGER.info("")
1280
+ LOGGER.info("=" * 80)
1281
+
1282
+ # Generate HTML report
1283
+ report_dir = PROJECT_ROOT / "reports" / "from_xyY"
1284
+ report_dir.mkdir(exist_ok=True)
1285
+ report_file = report_dir / "model_comparison.html"
1286
+ generate_html_report(
1287
+ results, len(xyY_samples), report_file, baseline_inference_time_ms
1288
+ )
1289
+
1290
+
1291
# Entry point: run the full model comparison when executed as a script.
if __name__ == "__main__":
    main()
learning_munsell/comparison/from_xyY/compare_gamma_model.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Quick comparison of the gamma-corrected models against baselines.
3
+
4
+ This script compares:
5
+ 1. MLP (Base) vs MLP (Gamma 2.33)
6
+ 2. Multi-Head (Base) vs Multi-Head (Gamma 2.33) vs Multi-Head (ST.2084)
7
+ """
8
+
9
+ import logging
10
+ from typing import Any
11
+
12
+ import numpy as np
13
+ import onnxruntime as ort
14
+ from colour import XYZ_to_Lab, xyY_to_XYZ
15
+ from colour.difference import delta_E_CIE2000
16
+ from colour.models import eotf_inverse_ST2084
17
+ from colour.notation.datasets.munsell import MUNSELL_COLOURS_REAL
18
+ from colour.notation.munsell import (
19
+ CCS_ILLUMINANT_MUNSELL,
20
+ munsell_colour_to_munsell_specification,
21
+ munsell_specification_to_xyY,
22
+ )
23
+ from numpy.typing import NDArray
24
+
25
+ from learning_munsell import PROJECT_ROOT
26
+
27
+ logging.basicConfig(level=logging.INFO, format="%(message)s")
28
+ LOGGER = logging.getLogger(__name__)
29
+
30
+
31
def normalize_input_standard(X: NDArray, params: dict[str, Any]) -> NDArray:
    """Min-max normalize the x, y and Y channels using the stored ranges."""
    result = np.copy(X)
    for channel, key in enumerate(("x_range", "y_range", "Y_range")):
        low, high = params[key]
        result[..., channel] = (X[..., channel] - low) / (high - low)
    return result.astype(np.float32)
44
+
45
+
46
def normalize_input_gamma(X: NDArray, params: dict[str, Any]) -> NDArray:
    """Min-max normalize xyY, then gamma-encode the luminance channel."""
    exponent = 1.0 / params.get("gamma", 2.33)
    result = np.copy(X)
    for channel, key in enumerate(("x_range", "y_range")):
        low, high = params[key]
        result[..., channel] = (X[..., channel] - low) / (high - low)
    # Normalize Y to [0, 1], clamp, then apply the inverse-gamma power curve.
    Y_low, Y_high = params["Y_range"]
    Y_unit = np.clip((X[..., 2] - Y_low) / (Y_high - Y_low), 0, 1)
    result[..., 2] = np.power(Y_unit, exponent)
    return result.astype(np.float32)
63
+
64
+
65
def normalize_input_st2084(X: NDArray, params: dict[str, Any]) -> NDArray:
    """Min-max normalize xyY, then PQ-encode (SMPTE ST 2084) the luminance."""
    peak_luminance = params.get("L_p", 100.0)
    result = np.copy(X)
    for channel, key in enumerate(("x_range", "y_range")):
        low, high = params[key]
        result[..., channel] = (X[..., channel] - low) / (high - low)
    # Normalize Y to [0, 1] and clamp before encoding.
    Y_low, Y_high = params["Y_range"]
    Y_unit = np.clip((X[..., 2] - Y_low) / (Y_high - Y_low), 0, 1)
    # Express luminance in cd/m² before applying the ST.2084 inverse EOTF.
    result[..., 2] = eotf_inverse_ST2084(Y_unit * peak_luminance, L_p=peak_luminance)
    return result.astype(np.float32)
84
+
85
+
86
def denormalize_output(y_norm: NDArray, params: dict[str, Any]) -> NDArray:
    """Map normalized network outputs back to Munsell component ranges."""
    result = np.copy(y_norm)
    # Channel order matches the network output: hue, value, chroma, code.
    component_keys = ("hue_range", "value_range", "chroma_range", "code_range")
    for channel, key in enumerate(component_keys):
        low, high = params[key]
        result[..., channel] = y_norm[..., channel] * (high - low) + low
    return result
106
+
107
+
108
def clamp_munsell_specification(spec: NDArray) -> NDArray:
    """Clip a Munsell specification to the ranges the colour library accepts."""
    clamped = np.copy(spec)
    # (low, high) per channel: hue, value (library constraint), chroma, code.
    bounds = ((0.0, 10.0), (1.0, 9.0), (0.0, 50.0), (1.0, 10.0))
    for channel, (low, high) in enumerate(bounds):
        clamped[..., channel] = np.clip(spec[..., channel], low, high)
    return clamped
116
+
117
+
118
def compute_delta_e(pred: NDArray, reference_Lab: NDArray) -> list[float]:
    """Compute per-sample Delta-E CIE2000 between predictions and reference Lab."""
    values: list[float] = []
    for prediction, target_Lab in zip(pred, reference_Lab):
        try:
            specification = clamp_munsell_specification(prediction)
            rounded = specification.copy()
            # The round-trip conversion expects an integral hue-family code.
            rounded[3] = round(specification[3])
            xyY = munsell_specification_to_xyY(rounded)
            Lab = XYZ_to_Lab(xyY_to_XYZ(xyY), CCS_ILLUMINANT_MUNSELL)
            values.append(delta_E_CIE2000(target_Lab, Lab))
        except (RuntimeError, ValueError):
            # Skip samples whose specification cannot be converted.
            continue
    return values
134
+
135
+
136
def _load_real_munsell_dataset() -> tuple[NDArray, NDArray, NDArray]:
    """Load real Munsell colours as (xyY, Munsell specification, Lab) arrays.

    Colours that the colour library cannot convert are silently skipped.
    """
    xyY_values = []
    munsell_specs = []
    reference_Lab = []

    for munsell_spec_tuple, xyY in MUNSELL_COLOURS_REAL:
        try:
            hue_code, value, chroma = munsell_spec_tuple
            munsell_str = f"{hue_code} {value}/{chroma}"
            spec = munsell_colour_to_munsell_specification(munsell_str)
            # Dataset stores Y on a 0-100 scale; rescale to [0, 1].
            xyY_scaled = np.array([xyY[0], xyY[1], xyY[2] / 100.0])

            XYZ = xyY_to_XYZ(xyY_scaled)
            Lab = XYZ_to_Lab(XYZ, CCS_ILLUMINANT_MUNSELL)

            xyY_values.append(xyY_scaled)
            munsell_specs.append(spec)
            reference_Lab.append(Lab)
        except (RuntimeError, ValueError):
            continue

    return np.array(xyY_values), np.array(munsell_specs), np.array(reference_Lab)


def _evaluate_onnx_model(
    models_dir,
    stem: str,
    input_name: str,
    normalize_fn,
    xyY_array: NDArray,
    ground_truth: NDArray,
    reference_Lab: NDArray,
) -> list[float]:
    """Evaluate one ONNX model, log its metrics and return its Delta-E values.

    Parameters
    ----------
    models_dir
        Directory containing ``{stem}.onnx`` and
        ``{stem}_normalization_params.npz``.
    stem
        Base file name of the model.
    input_name
        Name of the ONNX graph input.
    normalize_fn
        Callable ``(X, input_params) -> NDArray`` normalizing the xyY input.
    xyY_array, ground_truth, reference_Lab
        Evaluation samples, target Munsell specifications and reference Lab.

    Returns
    -------
    list[float]
        Per-sample Delta-E values, or an empty list when the files are missing.
    """
    onnx_file = models_dir / f"{stem}.onnx"
    params_file = models_dir / f"{stem}_normalization_params.npz"

    if not (onnx_file.exists() and params_file.exists()):
        LOGGER.info(" Model not found, skipping...")
        return []

    session = ort.InferenceSession(str(onnx_file))
    params_data = np.load(params_file, allow_pickle=True)
    input_params = params_data["input_params"].item()
    output_params = params_data["output_params"].item()

    X_norm = normalize_fn(xyY_array, input_params)
    pred_norm = session.run(None, {input_name: X_norm})[0]
    pred = denormalize_output(pred_norm, output_params)

    errors = np.abs(pred - ground_truth)
    delta_E = compute_delta_e(pred, reference_Lab)

    LOGGER.info(" Hue MAE: %.4f", np.mean(errors[:, 0]))
    LOGGER.info(" Value MAE: %.4f", np.mean(errors[:, 1]))
    LOGGER.info(" Chroma MAE: %.4f", np.mean(errors[:, 2]))
    LOGGER.info(" Code MAE: %.4f", np.mean(errors[:, 3]))
    LOGGER.info(
        " Delta-E: %.4f (mean), %.4f (median)",
        np.mean(delta_E),
        np.median(delta_E),
    )
    return delta_E


def _log_section(title: str) -> None:
    """Log a minor (dashed) section header."""
    LOGGER.info("\n" + "-" * 40)
    LOGGER.info(title)
    LOGGER.info("-" * 40)


def _log_improvement(
    label: str, base_delta_E: list[float], candidate_delta_E: list[float]
) -> None:
    """Log the relative mean Delta-E improvement of a candidate over a base."""
    base_mean = np.mean(base_delta_E)
    improvement = (base_mean - np.mean(candidate_delta_E)) / base_mean * 100
    if improvement > 0:
        LOGGER.info(" %s %.1f%% BETTER", label, improvement)
    else:
        LOGGER.info(" %s %.1f%% WORSE", label, -improvement)


def main() -> None:
    """Compare gamma/PQ-encoded models against the standard baselines."""
    LOGGER.info("=" * 80)
    LOGGER.info("Gamma Model Comparison: MLP vs MLP (Gamma 2.33)")
    LOGGER.info("=" * 80)

    models_dir = PROJECT_ROOT / "models" / "from_xyY"

    LOGGER.info("\nLoading real Munsell colours...")
    xyY_array, ground_truth, reference_Lab = _load_real_munsell_dataset()
    LOGGER.info("Loaded %d real Munsell colours", len(xyY_array))

    samples = (xyY_array, ground_truth, reference_Lab)

    # --- MLP experiment -----------------------------------------------------
    _log_section("1. MLP (Base) - Standard Normalization")
    delta_E_base = _evaluate_onnx_model(
        models_dir, "mlp", "xyY", normalize_input_standard, *samples
    )

    _log_section("2. MLP (Gamma 2.33) - Gamma-Corrected Y")
    delta_E_gamma = _evaluate_onnx_model(
        models_dir, "mlp_gamma", "xyY_gamma", normalize_input_gamma, *samples
    )

    if delta_E_base and delta_E_gamma:
        LOGGER.info("\n" + "=" * 80)
        LOGGER.info("MLP COMPARISON SUMMARY")
        LOGGER.info("=" * 80)
        LOGGER.info("")
        LOGGER.info("Delta-E (lower is better):")
        LOGGER.info(
            " MLP (Base): %.4f mean, %.4f median",
            np.mean(delta_E_base),
            np.median(delta_E_base),
        )
        LOGGER.info(
            " MLP (Gamma): %.4f mean, %.4f median",
            np.mean(delta_E_gamma),
            np.median(delta_E_gamma),
        )
        LOGGER.info("")
        _log_improvement("Gamma model is", delta_E_base, delta_E_gamma)

    # --- Multi-Head experiment ----------------------------------------------
    LOGGER.info("\n" + "=" * 80)
    LOGGER.info("MULTI-HEAD GAMMA EXPERIMENT")
    LOGGER.info("=" * 80)

    _log_section("3. Multi-Head (Base) - Standard Normalization")
    delta_E_mh_base = _evaluate_onnx_model(
        models_dir, "multi_head", "xyY", normalize_input_standard, *samples
    )

    _log_section("4. Multi-Head (Gamma 2.33) - Gamma-Corrected Y")
    delta_E_mh_gamma = _evaluate_onnx_model(
        models_dir, "multi_head_gamma", "xyY_gamma", normalize_input_gamma, *samples
    )

    _log_section("5. Multi-Head (ST.2084) - PQ-Encoded Y")
    delta_E_mh_st2084 = _evaluate_onnx_model(
        models_dir,
        "multi_head_st2084",
        "xyY_st2084",
        normalize_input_st2084,
        *samples,
    )

    if delta_E_mh_base and delta_E_mh_gamma:
        LOGGER.info("\n" + "=" * 80)
        LOGGER.info("MULTI-HEAD COMPARISON SUMMARY")
        LOGGER.info("=" * 80)
        LOGGER.info("")
        LOGGER.info("Delta-E (lower is better):")
        LOGGER.info(
            " Multi-Head (Base): %.4f mean, %.4f median",
            np.mean(delta_E_mh_base),
            np.median(delta_E_mh_base),
        )
        LOGGER.info(
            " Multi-Head (Gamma): %.4f mean, %.4f median",
            np.mean(delta_E_mh_gamma),
            np.median(delta_E_mh_gamma),
        )
        if delta_E_mh_st2084:
            LOGGER.info(
                " Multi-Head (ST.2084): %.4f mean, %.4f median",
                np.mean(delta_E_mh_st2084),
                np.median(delta_E_mh_st2084),
            )
        LOGGER.info("")

        _log_improvement(
            "Multi-Head Gamma vs Base:", delta_E_mh_base, delta_E_mh_gamma
        )
        if delta_E_mh_st2084:
            _log_improvement(
                "Multi-Head ST.2084 vs Base:", delta_E_mh_base, delta_E_mh_st2084
            )
            _log_improvement(
                "Multi-Head ST.2084 vs Gamma:", delta_E_mh_gamma, delta_E_mh_st2084
            )

    LOGGER.info("\n" + "=" * 80)
387
+
388
+
389
# Entry point: run the gamma-model comparison when executed as a script.
if __name__ == "__main__":
    main()
learning_munsell/comparison/to_xyY/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Comparison scripts for Munsell to xyY conversion models."""
learning_munsell/comparison/to_xyY/compare_all_models.py ADDED
@@ -0,0 +1,617 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Compare all ML models for Munsell to xyY conversion on real Munsell data.
3
+
4
+ Models to compare:
5
+ 1. Simple MLP Approximator
6
+ 2. Multi-Head MLP
7
+ 3. Multi-Head MLP (Optimized) - with hyperparameter optimization
8
+ 4. Multi-Head + Multi-Error Predictor
9
+ 5. Multi-MLP - 3 independent branches
10
+ 6. Multi-MLP (Optimized) - 3 independent branches with optimized hyperparameters
11
+ 7. Multi-MLP + Error Predictor
12
+ 8. Multi-MLP + Multi-Error Predictor
13
+ 9. Multi-MLP (Optimized) + Multi-Error Predictor (Optimized)
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import logging
19
+ import time
20
+ import warnings
21
+ from typing import TYPE_CHECKING
22
+
23
+ import numpy as np
24
+ import onnxruntime as ort
25
+ from colour import XYZ_to_Lab, xyY_to_XYZ
26
+ from colour.difference import delta_E_CIE2000
27
+ from colour.notation.datasets.munsell import MUNSELL_COLOURS_REAL
28
+ from colour.notation.munsell import (
29
+ CCS_ILLUMINANT_MUNSELL,
30
+ munsell_colour_to_munsell_specification,
31
+ munsell_specification_to_xyY,
32
+ )
33
+ from numpy.typing import NDArray # noqa: TC002
34
+
35
+ from learning_munsell import PROJECT_ROOT
36
+ from learning_munsell.utilities.common import (
37
+ benchmark_inference_speed,
38
+ generate_html_report_footer,
39
+ generate_html_report_header,
40
+ generate_ranking_section,
41
+ get_model_size_mb,
42
+ )
43
+
44
+ if TYPE_CHECKING:
45
+ from pathlib import Path
46
+
47
+ logging.basicConfig(level=logging.INFO, format="%(message)s")
48
+ LOGGER = logging.getLogger(__name__)
49
+
50
+
51
def normalize_munsell(munsell: np.ndarray) -> np.ndarray:
    """Scale Munsell components into [0, 1] for network input."""
    # Per-component divisors: hue (in decade), value, chroma, code.
    divisors = np.array([10.0, 10.0, 50.0, 10.0])
    return (munsell / divisors).astype(np.float32)
59
+
60
+
61
def evaluate_model(
    session: ort.InferenceSession,
    X_norm: np.ndarray,
    ground_truth: np.ndarray,
    input_name: str = "munsell_normalized",
) -> dict:
    """Run one model over the normalized inputs and summarize its errors."""
    predictions = session.run(None, {input_name: X_norm})[0]
    abs_errors = np.abs(predictions - ground_truth)

    summary = {
        "x_mae": np.mean(abs_errors[:, 0]),
        "y_mae": np.mean(abs_errors[:, 1]),
        "Y_mae": np.mean(abs_errors[:, 2]),
        "predictions": predictions,
        "errors": abs_errors,
        "max_errors": np.max(abs_errors, axis=1),
    }
    return summary
79
+
80
+
81
def compute_delta_E(
    ml_predictions: np.ndarray,
    reference_xyY: np.ndarray,
) -> float:
    """Mean Delta-E CIE2000 between predicted and reference (ground truth) xyY."""

    def _to_Lab(xyY: np.ndarray) -> np.ndarray:
        # Lab is computed relative to the Munsell reference illuminant.
        return XYZ_to_Lab(xyY_to_XYZ(xyY), CCS_ILLUMINANT_MUNSELL)

    collected = []
    for predicted_xyY, target_xyY in zip(ml_predictions, reference_xyY, strict=False):
        try:
            value = delta_E_CIE2000(_to_Lab(target_xyY), _to_Lab(predicted_xyY))
        except (RuntimeError, ValueError):
            # Skip samples whose conversion fails.
            continue
        if not np.isnan(value):
            collected.append(value)

    return np.mean(collected) if collected else np.nan
103
+
104
+
105
+ def generate_html_report(
106
+ results: dict,
107
+ num_samples: int,
108
+ output_file: Path,
109
+ baseline_inference_time_ms: float,
110
+ ) -> None:
111
+ """Generate HTML report with visualizations."""
112
+ # Calculate average MAE
113
+ avg_maes = {}
114
+ for model_name, result in results.items():
115
+ avg_maes[model_name] = np.mean(
116
+ [
117
+ result["x_mae"],
118
+ result["y_mae"],
119
+ result["Y_mae"],
120
+ ]
121
+ )
122
+
123
+ # Sort by average MAE
124
+ sorted_models = sorted(avg_maes.items(), key=lambda x: x[1])
125
+
126
+ # Start HTML
127
+ html = generate_html_report_header(
128
+ title="ML Model Comparison Report",
129
+ subtitle="Munsell to xyY Conversion",
130
+ num_samples=num_samples,
131
+ )
132
+
133
+ # Best Models Summary
134
+ best_size = min(results.items(), key=lambda x: x[1]["model_size_mb"])[0]
135
+ best_speed = min(results.items(), key=lambda x: x[1]["inference_time_ms"])[0]
136
+ best_avg = sorted_models[0][0]
137
+
138
+ # Find best Delta-E
139
+ delta_E_results = [
140
+ (n, r["delta_E"]) for n, r in results.items() if not np.isnan(r["delta_E"])
141
+ ]
142
+ best_delta_E = (
143
+ min(delta_E_results, key=lambda x: x[1])[0] if delta_E_results else None
144
+ )
145
+
146
+ html += f"""
147
+ <!-- Best Models Summary -->
148
+ <div class="bg-card rounded-lg border border-border p-6 shadow-lg">
149
+ <h2 class="text-2xl font-semibold mb-6 pb-3 border-b border-primary/30">Best Models by Metric</h2>
150
+ <div class="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-4 gap-4">
151
+ <div class="bg-gradient-to-br from-primary/10 to-primary/5 rounded-lg p-5 border border-primary/20">
152
+ <div class="text-xs font-semibold text-muted-foreground uppercase tracking-wide mb-2">Smallest Size</div>
153
+ <div class="text-3xl font-bold text-primary mb-3">{results[best_size]["model_size_mb"]:.2f} MB</div>
154
+ <div class="text-sm text-foreground/80">{best_size}</div>
155
+ </div>
156
+ <div class="bg-gradient-to-br from-primary/10 to-primary/5 rounded-lg p-5 border border-primary/20">
157
+ <div class="text-xs font-semibold text-muted-foreground uppercase tracking-wide mb-2">Fastest Speed</div>
158
+ <div class="text-3xl font-bold text-primary mb-3">{results[best_speed]["inference_time_ms"]:.4f} ms</div>
159
+ <div class="text-sm text-foreground/80">{best_speed}</div>
160
+ </div>
161
+ """
162
+
163
+ if best_delta_E:
164
+ html += f"""
165
+ <div class="bg-gradient-to-br from-primary/10 to-primary/5 rounded-lg p-5 border border-primary/20">
166
+ <div class="text-xs font-semibold text-muted-foreground uppercase tracking-wide mb-2">Best Delta-E</div>
167
+ <div class="text-3xl font-bold text-primary mb-3">{results[best_delta_E]["delta_E"]:.6f}</div>
168
+ <div class="text-sm text-foreground/80">{best_delta_E}</div>
169
+ </div>
170
+ """
171
+
172
+ html += f"""
173
+ <div class="bg-gradient-to-br from-primary/10 to-primary/5 rounded-lg p-5 border border-primary/20">
174
+ <div class="text-xs font-semibold text-muted-foreground uppercase tracking-wide mb-2">Best Average MAE</div>
175
+ <div class="text-3xl font-bold text-primary mb-3">{avg_maes[best_avg]:.6f}</div>
176
+ <div class="text-sm text-foreground/80">{best_avg}</div>
177
+ </div>
178
+ </div>
179
+ </div>
180
+ """
181
+
182
+ # Performance Metrics Table
183
+ sorted_by_avg_mae = sorted(results.items(), key=lambda x: avg_maes[x[0]])
184
+
185
+ html += """
186
+ <!-- Performance Metrics Table -->
187
+ <div class="bg-card rounded-lg border border-border p-6 shadow-lg">
188
+ <h2 class="text-2xl font-semibold mb-6 pb-3 border-b border-primary/30">Model Performance Metrics</h2>
189
+ <div class="overflow-x-auto">
190
+ <table class="w-full text-sm">
191
+ <thead>
192
+ <tr class="border-b border-border">
193
+ <th class="text-left py-3 px-4 font-semibold text-muted-foreground">Model</th>
194
+ <th class="text-right py-3 px-4 font-semibold text-muted-foreground">Size (MB)</th>
195
+ <th class="text-right py-3 px-4 font-semibold text-muted-foreground">Speed (ms/sample)</th>
196
+ <th class="text-right py-3 px-4 font-semibold text-muted-foreground">vs Baseline</th>
197
+ <th class="text-right py-3 px-4 font-semibold text-muted-foreground">MAE x</th>
198
+ <th class="text-right py-3 px-4 font-semibold text-muted-foreground">MAE y</th>
199
+ <th class="text-right py-3 px-4 font-semibold text-muted-foreground">MAE Y</th>
200
+ <th class="text-right py-3 px-4 font-semibold text-muted-foreground">Delta-E</th>
201
+ </tr>
202
+ </thead>
203
+ <tbody>
204
+ """
205
+
206
+ for model_name, result in sorted_by_avg_mae:
207
+ size_mb = result["model_size_mb"]
208
+ speed_ms = result["inference_time_ms"]
209
+ delta_E = result["delta_E"]
210
+
211
+ # Calculate speedup vs baseline
212
+ speedup = baseline_inference_time_ms / speed_ms if speed_ms > 0 else 0
213
+
214
+ size_class = "text-primary font-semibold" if model_name == best_size else ""
215
+ speed_class = "text-primary font-semibold" if model_name == best_speed else ""
216
+ delta_E_class = (
217
+ "text-primary font-semibold" if model_name == best_delta_E else ""
218
+ )
219
+
220
+ delta_E_str = f"{delta_E:.6f}" if not np.isnan(delta_E) else "—"
221
+
222
+ speedup_text = f"{speedup:.0f}x" if speedup > 100 else f"{speedup:.1f}x"
223
+
224
+ html += f"""
225
+ <tr class="border-b border-border/50 hover:bg-muted/30 transition-colors">
226
+ <td class="py-3 px-4 font-medium">{model_name}</td>
227
+ <td class="py-3 px-4 text-right {size_class}">{size_mb:.2f}</td>
228
+ <td class="py-3 px-4 text-right {speed_class}">{speed_ms:.4f}</td>
229
+ <td class="py-3 px-4 text-right text-primary font-semibold">{speedup_text}</td>
230
+ <td class="py-3 px-4 text-right">{result["x_mae"]:.6f}</td>
231
+ <td class="py-3 px-4 text-right">{result["y_mae"]:.6f}</td>
232
+ <td class="py-3 px-4 text-right">{result["Y_mae"]:.6f}</td>
233
+ <td class="py-3 px-4 text-right {delta_E_class}">{delta_E_str}</td>
234
+ </tr>
235
+ """
236
+
237
+ html += """
238
+ </tbody>
239
+ </table>
240
+ </div>
241
+ </div>
242
+ """
243
+
244
+ # Add ranking section
245
+ html += generate_ranking_section(
246
+ results,
247
+ metric_key="avg_mae",
248
+ title="Overall Ranking (by Average MAE)",
249
+ )
250
+
251
+ # Precision thresholds
252
+ thresholds = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
253
+
254
+ html += """
255
+ <div class="bg-card rounded-lg border border-border p-6 shadow-lg">
256
+ <h2 class="text-2xl font-semibold mb-3 pb-3 border-b border-primary/30">Accuracy at Precision Thresholds</h2>
257
+ <p class="text-sm text-muted-foreground mb-6">Percentage of predictions where max error across all components is below threshold:</p>
258
+ <div class="overflow-x-auto">
259
+ <table class="w-full text-sm">
260
+ <thead>
261
+ <tr class="border-b border-border">
262
+ <th class="text-left py-3 px-4 font-semibold text-muted-foreground">Model</th>
263
+ """
264
+
265
+ for threshold in thresholds:
266
+ html += f' <th class="text-right py-3 px-4 font-semibold text-muted-foreground">&lt; {threshold:.0e}</th>\n'
267
+
268
+ html += """
269
+ </tr>
270
+ </thead>
271
+ <tbody>
272
+ """
273
+
274
+ for model_name, _ in sorted_models:
275
+ result = results[model_name]
276
+ html += f"""
277
+ <tr class="border-b border-border hover:bg-muted/30 transition-colors">
278
+ <td class="text-left py-3 px-4 font-medium">{model_name}</td>
279
+ """
280
+ for threshold in thresholds:
281
+ accuracy_pct = np.mean(result["max_errors"] < threshold) * 100
282
+ html += f' <td class="text-right py-3 px-4">{accuracy_pct:.2f}%</td>\n'
283
+
284
+ html += """
285
+ </tr>
286
+ """
287
+
288
+ html += """
289
+ </tbody>
290
+ </table>
291
+ </div>
292
+ </div>
293
+ """
294
+
295
+ html += generate_html_report_footer()
296
+
297
+ # Write HTML file
298
+ with open(output_file, "w") as f:
299
+ f.write(html)
300
+
301
+ LOGGER.info("")
302
+ LOGGER.info("HTML report saved to: %s", output_file)
303
+
304
+
305
def main() -> None:
    """
    Compare all Munsell-to-xyY conversion models.

    Workflow:

    1. Load the real Munsell colours (``MUNSELL_COLOURS_REAL``) and their
       measured xyY values, rescaling Y by 1/100 to match model outputs.
    2. Benchmark the colour-science ``munsell_specification_to_xyY``
       reference implementation to obtain a speed baseline.
    3. Evaluate every ONNX model (single-stage networks and two-stage
       base + error-predictor pipelines) for per-component MAE, Delta-E,
       file size and inference speed.
    4. Log a summary table and write an HTML comparison report.
    """
    LOGGER.info("=" * 80)
    LOGGER.info("Munsell to xyY Model Comparison")
    LOGGER.info("=" * 80)

    # Paths
    model_directory = PROJECT_ROOT / "models" / "to_xyY"

    # Load real Munsell dataset
    LOGGER.info("")
    LOGGER.info("Loading real Munsell dataset...")
    munsell_specs = []
    xyY_ground_truth = []

    # Colours whose notation fails to parse are skipped silently; only
    # valid entries make it into the evaluation set.
    for munsell_spec_tuple, xyY in MUNSELL_COLOURS_REAL:
        try:
            hue_code, value, chroma = munsell_spec_tuple
            munsell_str = f"{hue_code} {value}/{chroma}"
            spec = munsell_colour_to_munsell_specification(munsell_str)
            # Y is rescaled from ~[0, 100] into [0, 1] to match model outputs.
            xyY_scaled = np.array([xyY[0], xyY[1], xyY[2] / 100.0])
            munsell_specs.append(spec)
            xyY_ground_truth.append(xyY_scaled)
        except Exception:  # noqa: BLE001, S112
            continue

    munsell_specs = np.array(munsell_specs, dtype=np.float32)
    xyY_ground_truth = np.array(xyY_ground_truth, dtype=np.float32)
    LOGGER.info("Loaded %d valid Munsell colors", len(munsell_specs))

    # Normalize inputs
    munsell_normalized = normalize_munsell(munsell_specs)

    # Benchmark colour library first
    LOGGER.info("")
    LOGGER.info("=" * 80)
    LOGGER.info("Colour Library (munsell_specification_to_xyY)")
    LOGGER.info("=" * 80)

    # Benchmark the munsell_specification_to_xyY function
    # Note: Using full dataset (100% of samples)

    # Set random seed for reproducibility
    np.random.seed(42)

    # Use 100% of samples for comprehensive benchmarking
    sampled_indices = np.arange(len(munsell_specs))
    munsell_benchmark = munsell_specs[sampled_indices]

    start_time = time.perf_counter()
    colour_predictions = []
    successful_inferences = 0

    # NaN placeholders keep prediction rows aligned with the input set
    # when the reference conversion fails for a colour.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for spec in munsell_benchmark:
            try:
                xyY = munsell_specification_to_xyY(spec)
                colour_predictions.append(xyY)
                successful_inferences += 1
            except (RuntimeError, ValueError):
                colour_predictions.append(np.array([np.nan, np.nan, np.nan]))

    end_time = time.perf_counter()

    total_time_s = end_time - start_time
    # Per-sample baseline latency; 0 if every conversion failed.
    baseline_inference_time_ms = (
        (total_time_s / successful_inferences) * 1000
        if successful_inferences > 0
        else 0
    )
    colour_predictions = np.array(colour_predictions)

    LOGGER.info(" Successful inferences: %d", successful_inferences)
    LOGGER.info(" Inference Speed: %.4f ms/sample", baseline_inference_time_ms)

    # Define models to compare
    models = [
        {
            "name": "Simple MLP",
            "files": [model_directory / "munsell_to_xyY_approximator.onnx"],
            "params_file": model_directory
            / "munsell_to_xyY_approximator_normalization_params.npz",
            "type": "single",
        },
        {
            "name": "Multi-Head",
            "files": [model_directory / "multi_head.onnx"],
            "params_file": model_directory / "multi_head_normalization_params.npz",
            "type": "single",
        },
        {
            "name": "Multi-Head (Optimized)",
            "files": [model_directory / "multi_head_optimized.onnx"],
            "params_file": model_directory
            / "multi_head_optimized_normalization_params.npz",
            "type": "single",
        },
        {
            "name": "Multi-Head + Multi-Error Predictor",
            "files": [
                model_directory / "multi_head.onnx",
                model_directory / "multi_head_multi_error_predictor.onnx",
            ],
            "params_file": model_directory
            / "multi_head_multi_error_predictor_normalization_params.npz",
            "type": "two_stage",
        },
        {
            "name": "Multi-MLP",
            "files": [model_directory / "multi_mlp.onnx"],
            "params_file": model_directory / "multi_mlp_normalization_params.npz",
            "type": "single",
        },
        {
            "name": "Multi-MLP (Optimized)",
            "files": [model_directory / "multi_mlp_optimized.onnx"],
            "params_file": model_directory
            / "multi_mlp_optimized_normalization_params.npz",
            "type": "single",
        },
        {
            "name": "Multi-MLP + Error Predictor",
            "files": [
                model_directory / "multi_mlp.onnx",
                model_directory / "multi_mlp_error_predictor.onnx",
            ],
            "params_file": model_directory
            / "multi_mlp_error_predictor_normalization_params.npz",
            "type": "two_stage",
        },
        {
            "name": "Multi-MLP + Multi-Error Predictor",
            "files": [
                model_directory / "multi_mlp.onnx",
                model_directory / "multi_mlp_multi_error_predictor.onnx",
            ],
            "params_file": model_directory
            / "multi_mlp_multi_error_predictor_normalization_params.npz",
            "type": "two_stage",
        },
        {
            "name": "Multi-MLP (Optimized) + Multi-Error Predictor (Optimized)",
            "files": [
                model_directory / "multi_mlp_optimized.onnx",
                model_directory / "multi_mlp_multi_error_predictor_optimized.onnx",
            ],
            "params_file": model_directory
            / "multi_mlp_multi_error_predictor_optimized_normalization_params.npz",
            "type": "two_stage",
        },
    ]

    # Evaluate each model
    results = {}

    for model_info in models:
        model_name = model_info["name"]
        LOGGER.info("")
        LOGGER.info("=" * 80)
        LOGGER.info(model_name)
        LOGGER.info("=" * 80)

        # Calculate model size
        model_size_mb = get_model_size_mb(model_info["files"])

        if model_info["type"] == "two_stage":
            # Two-stage model
            base_session = ort.InferenceSession(str(model_info["files"][0]))
            error_session = ort.InferenceSession(str(model_info["files"][1]))
            error_input_name = error_session.get_inputs()[0].name

            # Define inference callable.  Default arguments bind the
            # *current* loop iteration's sessions, avoiding Python's
            # late-binding-closure pitfall inside the for-loop.
            def two_stage_inference(
                _base_session: ort.InferenceSession = base_session,
                _error_session: ort.InferenceSession = error_session,
                _munsell_normalized: NDArray = munsell_normalized,
                _error_input_name: str = error_input_name,
            ) -> NDArray:
                base_pred = _base_session.run(
                    None, {"munsell_normalized": _munsell_normalized}
                )[0]
                # Error predictor consumes the input concatenated with the
                # base prediction and emits an additive correction.
                combined = np.concatenate(
                    [_munsell_normalized, base_pred], axis=1
                ).astype(np.float32)
                error_corr = _error_session.run(
                    None, {_error_input_name: combined}
                )[0]
                return base_pred + error_corr

            # Benchmark speed
            inference_time_ms = benchmark_inference_speed(
                two_stage_inference, munsell_normalized
            )

            # Get predictions
            base_pred = base_session.run(
                None, {"munsell_normalized": munsell_normalized}
            )[0]
            combined = np.concatenate(
                [munsell_normalized, base_pred], axis=1
            ).astype(np.float32)
            error_corr = error_session.run(
                None, {error_input_name: combined}
            )[0]
            pred = base_pred + error_corr

            errors = np.abs(pred - xyY_ground_truth)
            result = {
                "x_mae": np.mean(errors[:, 0]),
                "y_mae": np.mean(errors[:, 1]),
                "Y_mae": np.mean(errors[:, 2]),
                "predictions": pred,
                "errors": errors,
                "max_errors": np.max(errors, axis=1),
            }
        else:
            # Single model
            session = ort.InferenceSession(str(model_info["files"][0]))

            # Define inference callable (defaults bind this iteration's
            # session; see note above).
            def single_inference(
                _session: ort.InferenceSession = session,
                _munsell_normalized: NDArray = munsell_normalized,
            ) -> NDArray:
                return _session.run(
                    None, {"munsell_normalized": _munsell_normalized}
                )[0]

            # Benchmark speed
            inference_time_ms = benchmark_inference_speed(
                single_inference, munsell_normalized
            )

            result = evaluate_model(session, munsell_normalized, xyY_ground_truth)

        result["model_size_mb"] = model_size_mb
        result["inference_time_ms"] = inference_time_ms
        result["avg_mae"] = np.mean(
            [result["x_mae"], result["y_mae"], result["Y_mae"]]
        )

        # Compute Delta-E against ground truth (measured xyY)
        sampled_predictions = result["predictions"][sampled_indices]
        result["delta_E"] = compute_delta_E(
            sampled_predictions,
            xyY_ground_truth,
        )

        results[model_name] = result

        # Print results
        LOGGER.info("")
        LOGGER.info("Mean Absolute Errors:")
        LOGGER.info(" x: %.6f", result["x_mae"])
        LOGGER.info(" y: %.6f", result["y_mae"])
        LOGGER.info(" Y: %.6f", result["Y_mae"])
        if not np.isnan(result["delta_E"]):
            LOGGER.info(" Delta-E (vs Ground Truth): %.6f", result["delta_E"])
        LOGGER.info("")
        LOGGER.info("Performance Metrics:")
        LOGGER.info(" Model Size: %.2f MB", result["model_size_mb"])
        LOGGER.info(
            " Inference Speed: %.4f ms/sample", result["inference_time_ms"]
        )
        LOGGER.info(
            " Speedup vs Colour: %.1fx",
            baseline_inference_time_ms / inference_time_ms,
        )

    # Summary
    LOGGER.info("")
    LOGGER.info("=" * 80)
    LOGGER.info("SUMMARY COMPARISON")
    LOGGER.info("=" * 80)
    LOGGER.info("")

    if not results:
        LOGGER.info("No models were successfully evaluated")
        return

    # MAE comparison table
    LOGGER.info("Mean Absolute Error Comparison:")
    LOGGER.info("")
    header = f"{'Model':<40} {'x':>10} {'y':>10} {'Y':>10} {'Delta-E':>12}"
    LOGGER.info(header)
    LOGGER.info("-" * 85)

    for model_name, result in results.items():
        delta_E_str = (
            f"{result['delta_E']:.6f}" if not np.isnan(result["delta_E"]) else "N/A"
        )
        LOGGER.info(
            "%-40s %10.6f %10.6f %10.6f %12s",
            model_name,
            result["x_mae"],
            result["y_mae"],
            result["Y_mae"],
            delta_E_str,
        )

    # Generate HTML report
    report_dir = PROJECT_ROOT / "reports" / "to_xyY"
    report_dir.mkdir(parents=True, exist_ok=True)
    report_file = report_dir / "model_comparison.html"
    generate_html_report(
        results, len(munsell_specs), report_file, baseline_inference_time_ms
    )
614
+
615
+
616
# Script entry point: run the full model comparison when executed directly.
if __name__ == "__main__":
    main()
learning_munsell/data_generation/generate_training_data.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Generate training data for ML-based xyY to Munsell conversion.
3
+
4
+ Generates samples by sampling in Munsell space and converting to xyY via
5
+ forward conversion, guaranteeing 100% valid samples.
6
+
7
+ Usage:
8
+ uv run python -m learning_munsell.data_generation.generate_training_data
9
+ uv run python -m learning_munsell.data_generation.generate_training_data \\
10
+ --n-samples 2000000 --perturbation 0.10 --output training_data_large
11
+ """
12
+
13
+ import argparse
14
+ import json
15
+ import logging
16
+ import multiprocessing as mp
17
+ import warnings
18
+ from datetime import datetime, timezone
19
+
20
+ import numpy as np
21
+ from colour.notation.datasets.munsell import MUNSELL_COLOURS_ALL
22
+ from colour.notation.munsell import (
23
+ munsell_colour_to_munsell_specification,
24
+ munsell_specification_to_xyY,
25
+ )
26
+ from colour.utilities import ColourUsageWarning
27
+ from numpy.typing import NDArray
28
+ from sklearn.model_selection import train_test_split
29
+
30
+ from learning_munsell import PROJECT_ROOT
31
+
32
+ logging.basicConfig(level=logging.INFO, format="%(message)s")
33
+ LOGGER = logging.getLogger(__name__)
34
+
35
+
36
def _worker_generate_samples(
    args: tuple[int, NDArray, int, float],
) -> tuple[list[NDArray], list[NDArray]]:
    """
    Generate perturbed Munsell/xyY sample pairs in one worker process.

    Parameters
    ----------
    args : tuple
        ``(worker_id, base_specs, samples_per_base, perturbation_pct)``:
        worker identifier (seeds the RNG), base Munsell specifications to
        perturb, number of perturbed draws per base colour, and the
        perturbation amplitude as a fraction of each component's range.

    Returns
    -------
    tuple
        Lists of xyY arrays and matching Munsell specification arrays for
        every perturbation that converted successfully.
    """
    worker_id, base_specs, samples_per_base, perturbation_pct = args

    # Deterministic but distinct seed per worker: reproducible runs with
    # decorrelated streams.
    np.random.seed(42 + worker_id)

    # The forward conversion is noisy near gamut boundaries; silence it.
    warnings.filterwarnings("ignore", category=ColourUsageWarning)
    warnings.filterwarnings("ignore", category=RuntimeWarning)

    collected_xyY: list[NDArray] = []
    collected_specs: list[NDArray] = []

    # (component index, range span, clip low, clip high) for the hue,
    # value and chroma components of the specification vector.
    component_limits = (
        (0, 9.5, 0.5, 10.0),
        (1, 9.0, 1.0, 10.0),
        (2, 50.0, 0.0, 50.0),
    )

    for base_spec in base_specs:
        for _ in range(samples_per_base):
            candidate = base_spec.copy()
            # Draw hue, value, chroma deltas in that order (preserves the
            # RNG call sequence) and clip each back into its valid range.
            for index, span, lower, upper in component_limits:
                delta = np.random.uniform(
                    -perturbation_pct * span, perturbation_pct * span
                )
                candidate[index] = np.clip(base_spec[index] + delta, lower, upper)

            try:
                xyY = munsell_specification_to_xyY(candidate)
            except Exception:  # noqa: BLE001, S112
                continue
            collected_xyY.append(xyY)
            collected_specs.append(candidate)

    return collected_xyY, collected_specs
95
+
96
+
97
def generate_forward_munsell_samples(
    n_samples: int = 500000,
    perturbation_pct: float = 0.05,
    n_workers: int | None = None,
) -> tuple[NDArray, NDArray]:
    """
    Generate samples by sampling directly in Munsell space and converting to xyY.

    Base colours from ``MUNSELL_COLOURS_ALL`` are perturbed and converted
    forward with ``munsell_specification_to_xyY`` in parallel worker
    processes, so every returned pair is valid by construction.

    Parameters
    ----------
    n_samples : int
        Target number of samples to generate.
    perturbation_pct : float
        Perturbation as percentage of valid range.
    n_workers : int, optional
        Number of parallel workers. Defaults to CPU count.

    Returns
    -------
    tuple
        - xyY_samples: Array of shape (n, 3) with xyY values
        - munsell_samples: Array of shape (n, 4) with Munsell specifications
    """
    if n_workers is None:
        n_workers = mp.cpu_count()

    LOGGER.info(
        "Generating %d samples with %.0f%% perturbations using %d workers...",
        n_samples,
        perturbation_pct * 100,
        n_workers,
    )

    # Extract base Munsell specifications
    base_specs = []
    for munsell_spec_tuple, _ in MUNSELL_COLOURS_ALL:
        hue_code_str, value, chroma = munsell_spec_tuple
        munsell_str = f"{hue_code_str} {value}/{chroma}"
        spec = munsell_colour_to_munsell_specification(munsell_str)
        base_specs.append(spec)

    base_specs = np.array(base_specs)
    # +1 over-provisions so that failed conversions still leave enough
    # samples to trim down to n_samples.
    samples_per_base = n_samples // len(base_specs) + 1

    LOGGER.info("Using %d base Munsell colors", len(base_specs))
    LOGGER.info("Generating ~%d samples per base color", samples_per_base)

    # Split base specs across workers.
    # NOTE(review): if n_workers exceeds the number of base colours,
    # specs_per_worker is 0 and the last worker receives everything —
    # confirm this load imbalance is acceptable.
    specs_per_worker = len(base_specs) // n_workers
    worker_args = []

    for i in range(n_workers):
        start_idx = i * specs_per_worker
        # The last worker picks up the remainder of the integer division.
        end_idx = start_idx + specs_per_worker if i < n_workers - 1 else len(base_specs)
        worker_specs = base_specs[start_idx:end_idx]
        worker_args.append((i, worker_specs, samples_per_base, perturbation_pct))

    # Run in parallel
    LOGGER.info("Starting %d parallel workers...", n_workers)
    with mp.Pool(n_workers) as pool:
        results = pool.map(_worker_generate_samples, worker_args)

    # Combine results
    all_xyY = []
    all_munsell = []
    for xyY_samples, munsell_samples in results:
        all_xyY.extend(xyY_samples)
        all_munsell.extend(munsell_samples)

    # Trim to exact sample count
    all_xyY = all_xyY[:n_samples]
    all_munsell = all_munsell[:n_samples]

    LOGGER.info("Generated %d valid samples", len(all_xyY))
    return np.array(all_xyY), np.array(all_munsell)
172
+
173
+
174
def main(
    n_samples: int = 500000,
    perturbation_pct: float = 0.05,
    output: str = "training_data",
) -> None:
    """
    Generate and save training data.

    Parameters
    ----------
    n_samples : int
        Target number of (xyY, Munsell) pairs to generate.
    perturbation_pct : float
        Perturbation amplitude as a fraction of each component's range.
    output : str
        Output filename stem; data goes to ``<output>.npz`` and metadata
        to ``<output>_params.json`` under ``PROJECT_ROOT / "data"``.
    """
    LOGGER.info("=" * 80)
    LOGGER.info("Training Data Generation")
    LOGGER.info("=" * 80)

    output_dir = PROJECT_ROOT / "data"
    output_dir.mkdir(exist_ok=True)

    LOGGER.info("")
    LOGGER.info("SAMPLING STRATEGY")
    LOGGER.info("=" * 80)
    LOGGER.info("Forward Munsell->xyY sampling:")
    LOGGER.info(
        " - Base: %d colors from MUNSELL_COLOURS_ALL", len(MUNSELL_COLOURS_ALL)
    )
    LOGGER.info(
        " - Perturbations: +/-%.0f%% of valid range per component",
        perturbation_pct * 100,
    )
    LOGGER.info(
        " - Hue: +/-%.2f (+/-%.0f%% of 9.5 range)",
        perturbation_pct * 9.5,
        perturbation_pct * 100,
    )
    LOGGER.info(
        " - Value: +/-%.2f (+/-%.0f%% of 9.0 range)",
        perturbation_pct * 9.0,
        perturbation_pct * 100,
    )
    LOGGER.info(
        " - Chroma: +/-%.1f (+/-%.0f%% of 50.0 range)",
        perturbation_pct * 50.0,
        perturbation_pct * 100,
    )
    LOGGER.info(" - Target samples: %d", n_samples)
    LOGGER.info("=" * 80)
    LOGGER.info("")

    # Generate samples
    xyY_all, munsell_all = generate_forward_munsell_samples(
        n_samples=n_samples,
        perturbation_pct=perturbation_pct,
    )

    # All-ones by construction (forward sampling only yields valid pairs);
    # kept — presumably for schema compatibility with downstream loaders.
    valid_mask = np.ones(len(xyY_all), dtype=bool)

    LOGGER.info("")
    LOGGER.info("Sample statistics:")
    LOGGER.info(" Total samples generated: %d", len(xyY_all))
    LOGGER.info(" All samples are valid (100%% by forward conversion)")

    LOGGER.info("")
    LOGGER.info("Using %d valid samples for training", len(xyY_all))

    # Split into train/validation/test (70/15/15):
    # 15% to test first, then 0.15/0.85 of the remainder to validation so
    # validation is 15% of the original total.
    X_temp, X_test, y_temp, y_test = train_test_split(
        xyY_all, munsell_all, test_size=0.15, random_state=42
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=0.15 / 0.85, random_state=42
    )

    LOGGER.info("")
    LOGGER.info("Data split:")
    LOGGER.info(" Train: %d samples", len(X_train))
    LOGGER.info(" Validation: %d samples", len(X_val))
    LOGGER.info(" Test: %d samples", len(X_test))

    # Save training data
    cache_file = output_dir / f"{output}.npz"
    np.savez_compressed(
        cache_file,
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
        X_test=X_test,
        y_test=y_test,
        xyY_all=xyY_all,
        munsell_all=munsell_all,
        valid_mask=valid_mask,
    )

    # Save parameters to sidecar file
    params_file = output_dir / f"{output}_params.json"
    params = {
        "n_samples": n_samples,
        "perturbation_pct": perturbation_pct,
        "n_base_colors": len(MUNSELL_COLOURS_ALL),
        "train_samples": len(X_train),
        "val_samples": len(X_val),
        "test_samples": len(X_test),
        "generated_at": datetime.now(timezone.utc).isoformat(),
    }
    with open(params_file, "w") as f:
        json.dump(params, f, indent=2)

    LOGGER.info("")
    LOGGER.info("Training data saved to: %s", cache_file)
    LOGGER.info("Parameters saved to: %s", params_file)
    LOGGER.info("=" * 80)
280
+
281
+
282
# CLI entry point: parse generation options and delegate to main().
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Generate training data for xyY to Munsell conversion"
    )
    parser.add_argument(
        "--n-samples",
        type=int,
        default=500000,
        help="Number of samples to generate (default: 500000)",
    )
    parser.add_argument(
        "--perturbation",
        type=float,
        default=0.05,
        help="Perturbation as fraction of valid range (default: 0.05)",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="training_data",
        help="Output filename without extension (default: training_data)",
    )
    args = parser.parse_args()

    main(
        n_samples=args.n_samples,
        perturbation_pct=args.perturbation,
        output=args.output,
    )
learning_munsell/interpolation/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Interpolation-based methods for Munsell conversions."""
learning_munsell/interpolation/from_xyY/__init__.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Interpolation-based methods for xyY to Munsell conversions."""
2
+
3
+ import numpy as np
4
+ from colour.notation.datasets.munsell import MUNSELL_COLOURS_ALL
5
+ from colour.notation.munsell import munsell_colour_to_munsell_specification
6
+ from numpy.typing import NDArray
7
+
8
+
9
def load_munsell_reference_data() -> tuple[NDArray, NDArray]:
    """
    Load reference Munsell data from the colour library.

    Every entry of ``MUNSELL_COLOURS_ALL`` is converted to a numeric
    specification ``[hue, value, chroma, code]``, and the Y luminance
    component of its xyY coordinates is rescaled by 1/100 into [0, 1]
    (native range ~0-102.57).

    Returns
    -------
    Tuple[NDArray, NDArray]
        X : xyY values of shape (4995, 3) with Y normalized to [0, 1]
        y : Munsell specifications of shape (4995, 4)
    """
    coordinates = []
    specifications = []

    for (hue_name, value, chroma), xyY in MUNSELL_COLOURS_ALL:
        notation = f"{hue_name} {value}/{chroma}"

        # Parse the notation string into [hue, value, chroma, code].
        specifications.append(munsell_colour_to_munsell_specification(notation))

        # Rescale Y into [0, 1]; x and y chromaticities pass through.
        coordinates.append(np.array([xyY[0], xyY[1], xyY[2] / 100.0]))

    return np.array(coordinates), np.array(specifications)
41
+
42
+
43
+ __all__ = ["load_munsell_reference_data"]
learning_munsell/interpolation/from_xyY/compare_methods.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Compare classical interpolation methods against the best ML model.
3
+
4
+ Evaluates RBF, KD-Tree, and Delaunay interpolation on REAL Munsell colors
5
+ and compares with the Multi-Head (W+B) + Multi-Error Predictor (W+B) model.
6
+ """
7
+
8
+ import logging
9
+
10
+ import numpy as np
11
+ import onnxruntime as ort
12
+ from colour.notation.datasets.munsell import MUNSELL_COLOURS_ALL
13
+ from colour.notation.munsell import munsell_colour_to_munsell_specification
14
+ from scipy.interpolate import LinearNDInterpolator, RBFInterpolator
15
+ from scipy.spatial import KDTree
16
+ from sklearn.model_selection import train_test_split
17
+
18
+ from learning_munsell import PROJECT_ROOT
19
+
20
+ logging.basicConfig(level=logging.INFO, format="%(message)s")
21
+ LOGGER = logging.getLogger(__name__)
22
+
23
+
24
def load_reference_data():
    """Load every reference Munsell colour as interpolator training data."""
    xyY_points, specifications = [], []
    for (hue_name, value, chroma), xyY in MUNSELL_COLOURS_ALL:
        notation = f"{hue_name} {value}/{chroma}"
        specifications.append(munsell_colour_to_munsell_specification(notation))
        # Rescale Y from ~[0, 102.57] into [0, 1].
        xyY_points.append([xyY[0], xyY[1], xyY[2] / 100.0])
    return np.array(xyY_points), np.array(specifications)
35
+
36
+
37
+
38
+
39
def evaluate(predictions, y_true, method_name):
    """Log and return the per-component MAE of *predictions* vs *y_true*."""
    component_names = ("hue", "value", "chroma", "code")
    abs_errors = np.abs(predictions - y_true)
    # One MAE per specification component, column order hue/value/chroma/code.
    results = {
        name: abs_errors[:, column].mean()
        for column, name in enumerate(component_names)
    }
    LOGGER.info(" %s:", method_name)
    for name in component_names:
        LOGGER.info(" %s MAE: %.4f", name.capitalize(), results[name])
    return results
52
+
53
+
54
def rbf_predict(X_train, y_train, X_test):
    """Predict Munsell components with one RBF interpolant per component."""
    component_columns = []
    # Fit an independent thin-plate-spline interpolant for each of the
    # four specification components.
    for component in range(4):
        interpolant = RBFInterpolator(
            X_train, y_train[:, component], kernel="thin_plate_spline"
        )
        component_columns.append(interpolant(X_test))
    return np.stack(component_columns, axis=1)
61
+
62
+
63
def kdtree_predict(X_train, y_train, X_test, k=5):
    """
    Predict Munsell components via k-nearest-neighbour inverse-distance weighting.

    Parameters
    ----------
    X_train : ndarray, shape (n_train, d)
        Reference xyY coordinates.
    y_train : ndarray, shape (n_train, 4)
        Munsell specifications matching ``X_train``.
    X_test : ndarray, shape (n_test, d)
        Query xyY coordinates.
    k : int
        Number of neighbours to blend (default 5); ``k=1`` now works too.

    Returns
    -------
    ndarray, shape (n_test, 4)
        Inverse-distance-squared weighted average of the k nearest
        neighbours' specifications; an exact match dominates the blend.
    """
    tree = KDTree(X_train)
    distances, indices = tree.query(X_test, k=k)

    # KDTree.query squeezes the neighbour axis when k == 1 (the original
    # implementation crashed there); restore a 2-D shape for any k.
    distances = distances.reshape(len(X_test), -1)
    indices = indices.reshape(len(X_test), -1)

    # Clamp to avoid division by zero for exact matches; a clamped point
    # receives an overwhelmingly large weight, i.e. is returned as-is.
    distances = np.maximum(distances, 1e-10)
    weights = 1.0 / (distances**2)
    weights /= weights.sum(axis=1, keepdims=True)

    # Vectorised weighted average over the neighbour axis, replacing the
    # original per-sample Python loop (same result, O(1) Python overhead).
    return np.sum(weights[:, :, np.newaxis] * y_train[indices], axis=1)
75
+
76
+
77
def delaunay_predict(X_train, y_train, X_test):
    """Piecewise-linear (Delaunay) interpolation with NN fallback."""
    fallback_tree = KDTree(X_train)

    # One linear interpolant per specification component, stacked column-wise.
    component_columns = [
        LinearNDInterpolator(X_train, y_train[:, component])(X_test)
        for component in range(4)
    ]
    predictions = np.stack(component_columns, axis=1)

    # Queries outside the convex hull come back as NaN; substitute the
    # nearest training sample's specification for those rows.
    outside_hull = np.any(np.isnan(predictions), axis=1)
    if outside_hull.sum() > 0:
        _, nearest = fallback_tree.query(X_test[outside_hull])
        predictions[outside_hull] = y_train[nearest]

    return predictions
93
+
94
+
95
def ml_predict(X_test):
    """
    ML model prediction using base + error predictor.

    Runs the two-stage ONNX pipeline (multi-head base model followed by a
    multi-error-predictor correction) on normalized xyY inputs and returns
    denormalized Munsell specifications ``[hue, value, chroma, code]``, or
    ``None`` when either model file is missing on disk.
    """
    base_path = PROJECT_ROOT / "models" / "from_xyY" / "multi_head_weighted_boundary.onnx"
    error_path = (
        PROJECT_ROOT
        / "models"
        / "from_xyY"
        / "multi_head_weighted_boundary_multi_error_predictor_weighted_boundary.onnx"
    )

    # Graceful skip so the comparison script still runs without the models.
    if not base_path.exists() or not error_path.exists():
        return None

    # Input is already normalized to [0, 1] for x, y, Y
    X_norm = X_test.astype(np.float32)

    # Base model prediction
    base_session = ort.InferenceSession(str(base_path))
    base_out = base_session.run(None, {"xyY": X_norm})[0]

    # Error predictor (takes xyY + base predictions)
    error_session = ort.InferenceSession(str(error_path))
    combined_input = np.concatenate([X_norm, base_out], axis=1).astype(np.float32)
    error_out = error_session.run(None, {"combined_input": combined_input})[0]

    # Combined prediction (normalized)
    pred_norm = base_out + error_out

    # Denormalize using actual ranges from params file.
    # NOTE(review): ranges are hard-coded here — confirm they match the
    # values stored in the model's normalization-params .npz sidecar.
    predictions = np.zeros_like(pred_norm)
    predictions[:, 0] = pred_norm[:, 0] * (10.0 - 0.5) + 0.5  # Hue: [0.5, 10]
    predictions[:, 1] = pred_norm[:, 1] * (10.0 - 0.0) + 0.0  # Value: [0, 10]
    predictions[:, 2] = pred_norm[:, 2] * (50.0 - 0.0) + 0.0  # Chroma: [0, 50]
    predictions[:, 3] = pred_norm[:, 3] * (10.0 - 1.0) + 1.0  # Code: [1, 10]

    return predictions
131
+
132
+
133
def main():
    """
    Compare all methods using held-out test set.

    Fits RBF, KD-Tree and Delaunay interpolators on an 80% split of the
    reference colours, evaluates all of them (plus the pre-trained ML
    model when available) on the remaining 20%, and logs a MAE summary.

    NOTE(review): the ML model is loaded pre-trained and was presumably
    fitted on the full reference dataset, so the 20% "held-out" set may
    overlap its training data — confirm before comparing head-to-head.
    """
    LOGGER.info("=" * 80)
    LOGGER.info("Classical Interpolation vs ML Model Comparison")
    LOGGER.info("=" * 80)

    LOGGER.info("")
    LOGGER.info("Loading data...")
    X_all, y_all = load_reference_data()

    # 80/20 train/test split for fair comparison
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.2, random_state=42
    )
    LOGGER.info(" Total: %d colors", len(X_all))
    LOGGER.info(" Training: %d colors (80%%)", len(X_train))
    LOGGER.info(" Test: %d colors (20%%)", len(X_test))

    results = {}

    # RBF
    LOGGER.info("")
    LOGGER.info("-" * 60)
    LOGGER.info("RBF Interpolation (thin_plate_spline)")
    rbf_pred = rbf_predict(X_train, y_train, X_test)
    results["RBF"] = evaluate(rbf_pred, y_test, "RBF")

    # KD-Tree
    LOGGER.info("")
    LOGGER.info("-" * 60)
    LOGGER.info("KD-Tree Interpolation (k=5, IDW)")
    kdt_pred = kdtree_predict(X_train, y_train, X_test, k=5)
    results["KD-Tree"] = evaluate(kdt_pred, y_test, "KD-Tree")

    # Delaunay
    LOGGER.info("")
    LOGGER.info("-" * 60)
    LOGGER.info("Delaunay Interpolation (with NN fallback)")
    del_pred = delaunay_predict(X_train, y_train, X_test)
    results["Delaunay"] = evaluate(del_pred, y_test, "Delaunay")

    # ML (skipped when the ONNX model files are absent)
    LOGGER.info("")
    LOGGER.info("-" * 60)
    LOGGER.info("ML Model (Multi-Head W+B + Multi-Error Predictor W+B)")
    ml_pred = ml_predict(X_test)
    if ml_pred is not None:
        results["ML"] = evaluate(ml_pred, y_test, "ML")
    else:
        LOGGER.info(" Skipped (model not found)")

    # Summary
    LOGGER.info("")
    LOGGER.info("=" * 80)
    LOGGER.info("SUMMARY (MAE on %d held-out test colors)", len(X_test))
    LOGGER.info("=" * 80)
    LOGGER.info("")
    LOGGER.info("%-12s %8s %8s %8s %8s", "Method", "Hue", "Value", "Chroma", "Code")
    LOGGER.info("-" * 52)

    for method, mae in results.items():
        LOGGER.info(
            "%-12s %8.4f %8.4f %8.4f %8.4f",
            method,
            mae["hue"],
            mae["value"],
            mae["chroma"],
            mae["code"],
        )

    LOGGER.info("")
    LOGGER.info("=" * 80)
205
+
206
+
207
+ if __name__ == "__main__":
208
+ main()
learning_munsell/interpolation/from_xyY/delaunay_interpolator.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Delaunay triangulation based interpolation for xyY to Munsell conversion.
3
+
4
+ This approach uses scipy's LinearNDInterpolator which performs piecewise
5
+ linear interpolation based on Delaunay triangulation.
6
+
7
+ Uses the 4,995 reference colors from MUNSELL_COLOURS_ALL directly.
8
+
9
+ Advantages:
10
+ - Piecewise linear: exact at data points, linear between
11
+ - Handles irregular point distributions
12
+ - No hyperparameters to tune
13
+
14
+ Disadvantages:
15
+ - Returns NaN outside convex hull of data points
16
+ - Non-convex Munsell boundary may cause issues
17
+ - C0 continuous only (discontinuous gradients at cell boundaries)
18
+ """
19
+
20
+ import logging
21
+ import pickle
22
+ from pathlib import Path
23
+
24
+ import numpy as np
25
+ from numpy.typing import NDArray
26
+ from scipy.interpolate import LinearNDInterpolator
27
+ from scipy.spatial import KDTree
28
+ from sklearn.model_selection import train_test_split
29
+
30
+ from learning_munsell import PROJECT_ROOT, setup_logging
31
+ from learning_munsell.interpolation.from_xyY import load_munsell_reference_data
32
+
33
+ logging.basicConfig(level=logging.INFO, format="%(message)s")
34
+ LOGGER = logging.getLogger(__name__)
35
+
36
+
37
class MunsellDelaunayInterpolator:
    """
    xyY -> Munsell interpolator backed by Delaunay triangulation.

    One :class:`LinearNDInterpolator` (piecewise linear over the Delaunay
    triangulation of the training points) is built per Munsell component.
    Queries outside the convex hull produce NaN; those rows can optionally
    be patched with the nearest training sample via a KD-Tree.
    """

    def __init__(self, fallback_to_nearest: bool = True) -> None:
        """
        Initialize the Delaunay interpolator.

        Parameters
        ----------
        fallback_to_nearest
            If True, use nearest neighbor for points outside convex hull.
            If False, return NaN for such points.
        """
        self.fallback_to_nearest = fallback_to_nearest
        self.interpolators: dict = {}
        self.kdtree: KDTree | None = None
        self.y_data: NDArray | None = None
        self.fitted = False

    def fit(self, X: NDArray, y: NDArray) -> "MunsellDelaunayInterpolator":
        """
        Build the per-component interpolators (and the fallback KD-Tree).

        Parameters
        ----------
        X
            xyY input values of shape (n, 3)
        y
            Munsell output values [hue, value, chroma, code] of shape (n, 4)

        Returns
        -------
        self
        """
        LOGGER.info("Building Delaunay interpolator...")
        LOGGER.info(" Fallback to nearest: %s", self.fallback_to_nearest)
        LOGGER.info(" Data points: %d", len(X))

        for column, component in enumerate(["hue", "value", "chroma", "code"]):
            LOGGER.info(" Building %s interpolator...", component)
            self.interpolators[component] = LinearNDInterpolator(X, y[:, column])

        if self.fallback_to_nearest:
            # Nearest-neighbour lookup for queries outside the convex hull.
            LOGGER.info(" Building KD-Tree for fallback...")
            self.kdtree = KDTree(X)
            self.y_data = y.copy()

        self.fitted = True
        LOGGER.info("Delaunay interpolator built successfully")
        return self

    def predict(self, X: NDArray) -> NDArray:
        """
        Predict Munsell values using Delaunay interpolation.

        Parameters
        ----------
        X
            xyY input values of shape (n, 3)

        Returns
        -------
        NDArray
            Predicted Munsell values [hue, value, chroma, code] of shape (n, 4)

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if not self.fitted:
            msg = "Interpolator not fitted. Call fit() first."
            raise RuntimeError(msg)

        predictions = np.zeros((len(X), 4))
        for column, component in enumerate(["hue", "value", "chroma", "code"]):
            predictions[:, column] = self.interpolators[component](X)

        if self.fallback_to_nearest:
            # A row with any NaN lies outside the triangulation's hull; all
            # four interpolators share the triangulation, so patch whole rows.
            outside = np.any(np.isnan(predictions), axis=1)
            n_outside = outside.sum()
            if n_outside > 0:
                LOGGER.debug(" %d points outside hull, using nearest neighbor", n_outside)
                _, nearest = self.kdtree.query(X[outside])
                predictions[outside] = self.y_data[nearest]

        return predictions

    def save(self, path: Path) -> None:
        """Save the interpolator to disk."""
        state = {
            "fallback_to_nearest": self.fallback_to_nearest,
            "interpolators": self.interpolators,
            "kdtree": self.kdtree,
            "y_data": self.y_data,
        }
        with open(path, "wb") as f:
            pickle.dump(state, f)
        LOGGER.info("Saved Delaunay interpolator to %s", path)

    @classmethod
    def load(cls, path: Path) -> "MunsellDelaunayInterpolator":
        """Load the interpolator from disk."""
        with open(path, "rb") as f:
            state = pickle.load(f)  # noqa: S301

        instance = cls(fallback_to_nearest=state["fallback_to_nearest"])
        instance.interpolators = state["interpolators"]
        instance.kdtree = state["kdtree"]
        instance.y_data = state["y_data"]
        instance.fitted = True

        LOGGER.info("Loaded Delaunay interpolator from %s", path)
        return instance
161
+
162
+
163
def evaluate_delaunay(
    interpolator: MunsellDelaunayInterpolator,
    X: NDArray,
    y: NDArray,
    name: str = "Test",
) -> dict:
    """Evaluate Delaunay interpolator performance."""
    predictions = interpolator.predict(X)

    # Rows containing NaN were not interpolable (outside hull, no fallback).
    row_has_nan = np.isnan(predictions).any(axis=1)
    nan_count = row_has_nan.sum()
    if nan_count > 0:
        LOGGER.warning(" %d/%d predictions contain NaN", nan_count, len(X))

    valid_mask = ~row_has_nan
    if valid_mask.sum() == 0:
        LOGGER.error(" All predictions are NaN!")
        return {key: float("nan") for key in ("hue", "value", "chroma", "code")}

    errors = np.abs(predictions[valid_mask] - y[valid_mask])

    results = {}
    LOGGER.info("%s set MAE (%d/%d valid):", name, valid_mask.sum(), len(X))
    for column, label in enumerate(["Hue", "Value", "Chroma", "Code"]):
        mae = errors[:, column].mean()
        results[label.lower()] = mae
        LOGGER.info(" %s: %.4f", label, mae)

    return results
200
+
201
+
202
def main() -> None:
    """Build and evaluate Delaunay interpolator using reference Munsell data.

    Performs a train/validation sweep over the fallback setting, then refits
    the best configuration on all reference colors and saves it to disk.
    """

    log_file = setup_logging("delaunay_interpolator", "from_xyY")

    LOGGER.info("=" * 80)
    LOGGER.info("Delaunay Interpolation for xyY to Munsell Conversion")
    LOGGER.info("Using MUNSELL_COLOURS_ALL reference data (4,995 colors)")
    LOGGER.info("=" * 80)

    # Load reference data from colour library
    LOGGER.info("")
    LOGGER.info("Loading reference Munsell data...")
    X_all, y_all = load_munsell_reference_data()
    LOGGER.info("Total reference colors: %d", len(X_all))

    # Split into train/validation (80/20)
    X_train, X_val, y_train, y_val = train_test_split(
        X_all, y_all, test_size=0.2, random_state=42
    )

    LOGGER.info("Train samples: %d", len(X_train))
    LOGGER.info("Validation samples: %d", len(X_val))

    # Test with and without fallback
    LOGGER.info("")
    LOGGER.info("Testing Delaunay interpolation...")
    LOGGER.info("-" * 60)

    best_config = None
    best_mae = float("inf")

    for fallback in [True, False]:
        LOGGER.info("")
        LOGGER.info("Fallback to nearest: %s", fallback)

        interpolator = MunsellDelaunayInterpolator(fallback_to_nearest=fallback)
        interpolator.fit(X_train, y_train)

        results = evaluate_delaunay(interpolator, X_val, y_val, "Validation")

        # Skip if results contain NaN
        if any(np.isnan(v) for v in results.values()):
            LOGGER.info(" Skipping due to NaN results")
            continue

        total_mae = sum(results.values())

        if total_mae < best_mae:
            best_mae = total_mae
            best_config = fallback

    if best_config is None:
        # BUG FIX: if every configuration was skipped (all-NaN results),
        # best_config stayed None and the final model was built with
        # fallback_to_nearest=None. Default to the safe configuration.
        LOGGER.warning(
            "No valid configuration found; defaulting to fallback_to_nearest=True"
        )
        best_config = True

    LOGGER.info("")
    LOGGER.info("=" * 60)
    LOGGER.info("Best configuration: fallback_to_nearest=%s", best_config)
    LOGGER.info("=" * 60)

    # Train final model on ALL data
    LOGGER.info("")
    LOGGER.info("Training final model on all %d reference colors...", len(X_all))

    final_interpolator = MunsellDelaunayInterpolator(fallback_to_nearest=best_config)
    final_interpolator.fit(X_all, y_all)

    LOGGER.info("")
    LOGGER.info("Final evaluation (training set = all data):")
    evaluate_delaunay(final_interpolator, X_all, y_all, "All data")

    # Save the model
    model_dir = PROJECT_ROOT / "models" / "from_xyY"
    model_dir.mkdir(parents=True, exist_ok=True)
    model_path = model_dir / "delaunay_interpolator.pkl"
    final_interpolator.save(model_path)

    LOGGER.info("")
    LOGGER.info("=" * 80)

    log_file.close()


if __name__ == "__main__":
    main()
learning_munsell/interpolation/from_xyY/kdtree_interpolator.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ KD-Tree based interpolation for xyY to Munsell conversion.
3
+
4
+ This approach uses scipy's KDTree for fast nearest neighbor lookups,
5
+ with optional weighted interpolation using k nearest neighbors.
6
+
7
+ Uses the 4,995 reference colors from MUNSELL_COLOURS_ALL directly.
8
+
9
+ Advantages over RBF:
10
+ - O(n) memory, O(log n) query time
11
+ - Scales to millions of data points
12
+ - No matrix inversion required
13
+
14
+ Advantages over ML:
15
+ - Deterministic
16
+ - No training required
17
+ - Easy to understand
18
+ """
19
+
20
+ import logging
21
+ import pickle
22
+ from pathlib import Path
23
+
24
+ import numpy as np
25
+ from numpy.typing import NDArray
26
+ from scipy.spatial import KDTree
27
+ from sklearn.model_selection import train_test_split
28
+
29
+ from learning_munsell import PROJECT_ROOT, setup_logging
30
+ from learning_munsell.interpolation.from_xyY import load_munsell_reference_data
31
+
32
+ logging.basicConfig(level=logging.INFO, format="%(message)s")
33
+ LOGGER = logging.getLogger(__name__)
34
+
35
+
36
class MunsellKDTreeInterpolator:
    """
    KD-Tree based interpolator for xyY to Munsell conversion.

    Uses k-nearest neighbors with inverse distance weighting
    for smooth interpolation.
    """

    def __init__(self, k: int = 5, power: float = 2.0) -> None:
        """
        Initialize the KD-Tree interpolator.

        Parameters
        ----------
        k
            Number of nearest neighbors to use for interpolation.
        power
            Power for inverse distance weighting. Higher = sharper.
        """
        self.k = k
        self.power = power
        self.tree: KDTree | None = None
        self.y_data: NDArray | None = None
        self.fitted = False

    def fit(self, X: NDArray, y: NDArray) -> "MunsellKDTreeInterpolator":
        """
        Build the KD-Tree from training data.

        Parameters
        ----------
        X
            xyY input values of shape (n, 3)
        y
            Munsell output values [hue, value, chroma, code] of shape (n, 4)

        Returns
        -------
        self
        """
        LOGGER.info("Building KD-Tree interpolator...")
        LOGGER.info(" k neighbors: %d", self.k)
        LOGGER.info(" IDW power: %.1f", self.power)
        LOGGER.info(" Data points: %d", len(X))

        self.tree = KDTree(X)
        self.y_data = y.copy()
        self.fitted = True

        LOGGER.info("KD-Tree built successfully")
        return self

    def predict(self, X: NDArray) -> NDArray:
        """
        Predict Munsell values using k-NN with IDW.

        Parameters
        ----------
        X
            xyY input values of shape (n, 3)

        Returns
        -------
        NDArray
            Predicted Munsell values [hue, value, chroma, code] of shape (n, 4)

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if not self.fitted:
            msg = "Interpolator not fitted. Call fit() first."
            raise RuntimeError(msg)

        # Query k nearest neighbors
        distances, indices = self.tree.query(X, k=self.k)

        # KDTree.query returns 1-D arrays when k == 1; normalise to (n, k)
        # so the weighting below handles every k uniformly.
        if self.k == 1:
            distances = distances.reshape(-1, 1)
            indices = indices.reshape(-1, 1)

        # Inverse distance weighting; clamp distances so a query that
        # coincides exactly with a training point does not divide by zero.
        distances = np.maximum(distances, 1e-10)
        weights = 1.0 / (distances**self.power)
        weights /= weights.sum(axis=1, keepdims=True)

        # PERF FIX: the original looped over samples in Python (with a
        # redundant per-row k == 1 branch). A single broadcast computes the
        # same weighted average: weights (n, k, 1) * neighbours (n, k, 4).
        return np.sum(weights[:, :, np.newaxis] * self.y_data[indices], axis=1)

    def save(self, path: Path) -> None:
        """Save the interpolator to disk."""
        with open(path, "wb") as f:
            pickle.dump(
                {
                    "k": self.k,
                    "power": self.power,
                    "tree": self.tree,
                    "y_data": self.y_data,
                },
                f,
            )
        LOGGER.info("Saved KD-Tree interpolator to %s", path)

    @classmethod
    def load(cls, path: Path) -> "MunsellKDTreeInterpolator":
        """Load the interpolator from disk."""
        with open(path, "rb") as f:
            data = pickle.load(f)  # noqa: S301

        instance = cls(k=data["k"], power=data["power"])
        instance.tree = data["tree"]
        instance.y_data = data["y_data"]
        instance.fitted = True

        LOGGER.info("Loaded KD-Tree interpolator from %s", path)
        return instance
158
+
159
+
160
def evaluate_kdtree(
    interpolator: MunsellKDTreeInterpolator,
    X: NDArray,
    y: NDArray,
    name: str = "Test",
) -> dict:
    """Evaluate KD-Tree interpolator performance."""
    # Per-component absolute errors against ground truth.
    errors = np.abs(interpolator.predict(X) - y)

    results = {}
    LOGGER.info("%s set MAE:", name)
    for column, label in enumerate(["Hue", "Value", "Chroma", "Code"]):
        mae = errors[:, column].mean()
        results[label.lower()] = mae
        LOGGER.info(" %s: %.4f", label, mae)

    return results
180
+
181
+
182
def main() -> None:
    """Build and evaluate KD-Tree interpolator using reference Munsell data."""

    log_file = setup_logging("kdtree_interpolator", "from_xyY")

    LOGGER.info("=" * 80)
    LOGGER.info("KD-Tree Interpolation for xyY to Munsell Conversion")
    LOGGER.info("Using MUNSELL_COLOURS_ALL reference data (4,995 colors)")
    LOGGER.info("=" * 80)

    # Reference colors come from the colour library.
    LOGGER.info("")
    LOGGER.info("Loading reference Munsell data...")
    X_all, y_all = load_munsell_reference_data()
    LOGGER.info("Total reference colors: %d", len(X_all))

    # Hold out 20% for validating the neighbour-count sweep.
    X_train, X_val, y_train, y_val = train_test_split(
        X_all, y_all, test_size=0.2, random_state=42
    )

    LOGGER.info("Train samples: %d", len(X_train))
    LOGGER.info("Validation samples: %d", len(X_val))

    best_k = None
    best_mae = float("inf")

    LOGGER.info("")
    LOGGER.info("Testing different k values...")
    LOGGER.info("-" * 60)

    # Sweep neighbour counts, keeping the k with the lowest summed MAE.
    for k in [1, 3, 5, 10, 20, 50]:
        LOGGER.info("")
        LOGGER.info("k = %d:", k)

        candidate = MunsellKDTreeInterpolator(k=k, power=2.0)
        candidate.fit(X_train, y_train)

        val_results = evaluate_kdtree(candidate, X_val, y_val, "Validation")
        combined_mae = sum(val_results.values())

        if combined_mae < best_mae:
            best_mae = combined_mae
            best_k = k

    LOGGER.info("")
    LOGGER.info("=" * 60)
    LOGGER.info("Best k: %d", best_k)
    LOGGER.info("=" * 60)

    # Refit the winning configuration on the full reference set.
    LOGGER.info("")
    LOGGER.info(
        "Training final model on all %d reference colors with k=%d...",
        len(X_all),
        best_k,
    )

    final_interpolator = MunsellKDTreeInterpolator(k=best_k, power=2.0)
    final_interpolator.fit(X_all, y_all)

    LOGGER.info("")
    LOGGER.info("Final evaluation (training set = all data):")
    evaluate_kdtree(final_interpolator, X_all, y_all, "All data")

    # Persist the final model.
    model_dir = PROJECT_ROOT / "models" / "from_xyY"
    model_dir.mkdir(parents=True, exist_ok=True)
    model_path = model_dir / "kdtree_interpolator.pkl"
    final_interpolator.save(model_path)

    LOGGER.info("")
    LOGGER.info("=" * 80)

    log_file.close()


if __name__ == "__main__":
    main()
learning_munsell/interpolation/from_xyY/rbf_interpolator.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ RBF (Radial Basis Function) interpolation for xyY to Munsell conversion.
3
+
4
+ This approach uses scipy's RBFInterpolator to build a lookup table
5
+ with smooth interpolation between known color samples.
6
+
7
+ Uses the 4,995 reference colors from MUNSELL_COLOURS_ALL directly.
8
+
9
+ Advantages over ML:
10
+ - Deterministic, no training required
11
+ - Exact interpolation at known points
12
+ - Smooth interpolation between points
13
+ - Easy to understand and debug
14
+
15
+ Disadvantages:
16
+ - Memory scales with number of data points
17
+ - Query time scales with data points (O(n) naive, can optimize)
18
+ - May struggle with extrapolation
19
+ """
20
+
21
+ import logging
22
+ import pickle
23
+ from pathlib import Path
24
+
25
+ import numpy as np
26
+ from numpy.typing import NDArray
27
+ from scipy.interpolate import RBFInterpolator
28
+ from sklearn.model_selection import train_test_split
29
+
30
+ from learning_munsell import PROJECT_ROOT, setup_logging
31
+ from learning_munsell.interpolation.from_xyY import load_munsell_reference_data
32
+
33
+ logging.basicConfig(level=logging.INFO, format="%(message)s")
34
+ LOGGER = logging.getLogger(__name__)
35
+
36
+
37
class MunsellRBFInterpolator:
    """
    RBF-based interpolator for xyY to Munsell conversion.

    One independent :class:`RBFInterpolator` is fitted per Munsell component
    (hue, value, chroma, code) so each can be tuned independently.
    """

    def __init__(
        self,
        kernel: str = "thin_plate_spline",
        smoothing: float = 0.0,
        epsilon: float | None = None,
    ) -> None:
        """
        Initialize the RBF interpolator.

        Parameters
        ----------
        kernel
            RBF kernel type. Options: 'linear', 'thin_plate_spline',
            'cubic', 'quintic', 'multiquadric', 'inverse_multiquadric',
            'inverse_quadratic', 'gaussian'
        smoothing
            Smoothing parameter. 0 = exact interpolation.
        epsilon
            Shape parameter for kernels that use it.
        """
        self.kernel = kernel
        self.smoothing = smoothing
        self.epsilon = epsilon
        self.interpolators: dict[str, RBFInterpolator] = {}
        self.fitted = False

    def fit(self, X: NDArray, y: NDArray) -> "MunsellRBFInterpolator":
        """
        Fit one RBF interpolator per Munsell component.

        Parameters
        ----------
        X
            xyY input values of shape (n, 3)
        y
            Munsell output values [hue, value, chroma, code] of shape (n, 4)

        Returns
        -------
        self
        """
        LOGGER.info("Fitting RBF interpolators...")
        LOGGER.info(" Kernel: %s", self.kernel)
        LOGGER.info(" Smoothing: %s", self.smoothing)
        LOGGER.info(" Data points: %d", len(X))

        # Constructor options are identical for every component; epsilon is
        # only forwarded when set, since not all kernels accept it.
        options: dict = {"kernel": self.kernel, "smoothing": self.smoothing}
        if self.epsilon is not None:
            options["epsilon"] = self.epsilon

        for column, component in enumerate(["hue", "value", "chroma", "code"]):
            LOGGER.info(" Building %s interpolator...", component)
            self.interpolators[component] = RBFInterpolator(X, y[:, column], **options)

        self.fitted = True
        LOGGER.info("RBF interpolators fitted successfully")

        return self

    def predict(self, X: NDArray) -> NDArray:
        """
        Predict Munsell values for given xyY inputs.

        Parameters
        ----------
        X
            xyY input values of shape (n, 3)

        Returns
        -------
        NDArray
            Predicted Munsell values [hue, value, chroma, code] of shape (n, 4)

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if not self.fitted:
            msg = "Interpolator not fitted. Call fit() first."
            raise RuntimeError(msg)

        predictions = np.zeros((len(X), 4))
        for column, component in enumerate(["hue", "value", "chroma", "code"]):
            predictions[:, column] = self.interpolators[component](X)

        return predictions

    def save(self, path: Path) -> None:
        """Save the interpolator to disk."""
        state = {
            "kernel": self.kernel,
            "smoothing": self.smoothing,
            "epsilon": self.epsilon,
            "interpolators": self.interpolators,
        }
        with open(path, "wb") as f:
            pickle.dump(state, f)
        LOGGER.info("Saved RBF interpolator to %s", path)

    @classmethod
    def load(cls, path: Path) -> "MunsellRBFInterpolator":
        """Load the interpolator from disk."""
        with open(path, "rb") as f:
            state = pickle.load(f)  # noqa: S301

        instance = cls(
            kernel=state["kernel"],
            smoothing=state["smoothing"],
            epsilon=state["epsilon"],
        )
        instance.interpolators = state["interpolators"]
        instance.fitted = True

        LOGGER.info("Loaded RBF interpolator from %s", path)
        return instance
166
+
167
+
168
def evaluate_rbf(
    interpolator: MunsellRBFInterpolator,
    X: NDArray,
    y: NDArray,
    name: str = "Test",
) -> dict[str, float]:
    """
    Evaluate RBF interpolator performance.

    Parameters
    ----------
    interpolator
        Fitted RBF interpolator
    X
        Input xyY values
    y
        Ground truth Munsell values
    name
        Name for logging

    Returns
    -------
    dict
        Dictionary of MAE values for each component
    """
    # Per-component absolute errors against ground truth.
    errors = np.abs(interpolator.predict(X) - y)

    results = {}
    LOGGER.info("%s set MAE:", name)
    for column, label in enumerate(["Hue", "Value", "Chroma", "Code"]):
        mae = errors[:, column].mean()
        results[label.lower()] = mae
        LOGGER.info(" %s: %.4f", label, mae)

    return results
206
+
207
+
208
def main() -> None:
    """Build and evaluate RBF interpolator using reference Munsell data.

    Sweeps several kernel/smoothing combinations on a held-out split, then
    refits the best configuration on all reference colors and saves it.
    """

    log_file = setup_logging("rbf_interpolator", "from_xyY")

    LOGGER.info("=" * 80)
    LOGGER.info("RBF Interpolation for xyY to Munsell Conversion")
    LOGGER.info("Using MUNSELL_COLOURS_ALL reference data (4,995 colors)")
    LOGGER.info("=" * 80)

    # Load reference data from colour library
    LOGGER.info("")
    LOGGER.info("Loading reference Munsell data...")
    X_all, y_all = load_munsell_reference_data()
    LOGGER.info("Total reference colors: %d", len(X_all))

    # Split into train/validation (80/20)
    X_train, X_val, y_train, y_val = train_test_split(
        X_all, y_all, test_size=0.2, random_state=42
    )

    LOGGER.info("Train samples: %d", len(X_train))
    LOGGER.info("Validation samples: %d", len(X_val))

    # Test different kernels
    kernels_to_test = [
        ("thin_plate_spline", 0.0),
        ("thin_plate_spline", 0.001),
        ("thin_plate_spline", 0.01),
        ("cubic", 0.0),
        ("linear", 0.0),
        ("multiquadric", 0.0),
    ]

    best_kernel = None
    best_smoothing = None
    best_mae = float("inf")

    LOGGER.info("")
    LOGGER.info("Testing different RBF kernels...")
    LOGGER.info("-" * 60)

    for kernel, smoothing in kernels_to_test:
        LOGGER.info("")
        LOGGER.info("Kernel: %s, Smoothing: %s", kernel, smoothing)

        try:
            interpolator = MunsellRBFInterpolator(kernel=kernel, smoothing=smoothing)
            interpolator.fit(X_train, y_train)

            results = evaluate_rbf(interpolator, X_val, y_val, "Validation")
            total_mae = sum(results.values())

            if total_mae < best_mae:
                best_mae = total_mae
                best_kernel = kernel
                best_smoothing = smoothing

        except Exception:
            LOGGER.exception(" Failed")

    if best_kernel is None:
        # BUG FIX: if every kernel raised, best_kernel/best_smoothing stayed
        # None and the final fit below crashed with an opaque error inside
        # scipy. Fall back to the default configuration instead.
        LOGGER.warning(
            "All kernel configurations failed; defaulting to thin_plate_spline"
        )
        best_kernel, best_smoothing = "thin_plate_spline", 0.0

    LOGGER.info("")
    LOGGER.info("=" * 60)
    LOGGER.info("Best configuration: %s with smoothing=%s", best_kernel, best_smoothing)
    LOGGER.info("=" * 60)

    # Train final model with best kernel on ALL data
    LOGGER.info("")
    LOGGER.info("Training final model on all %d reference colors...", len(X_all))

    final_interpolator = MunsellRBFInterpolator(
        kernel=best_kernel, smoothing=best_smoothing
    )
    final_interpolator.fit(X_all, y_all)

    LOGGER.info("")
    LOGGER.info("Final evaluation (training set = all data):")
    evaluate_rbf(final_interpolator, X_all, y_all, "All data")

    # Save the model
    model_dir = PROJECT_ROOT / "models" / "from_xyY"
    model_dir.mkdir(parents=True, exist_ok=True)
    model_path = model_dir / "rbf_interpolator.pkl"
    final_interpolator.save(model_path)

    LOGGER.info("")
    LOGGER.info("=" * 80)

    log_file.close()


if __name__ == "__main__":
    main()
learning_munsell/losses/__init__.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Loss functions for Munsell ML training."""
2
+
3
+ from learning_munsell.losses.jax_delta_e import (
4
+ XYZ_to_Lab,
5
+ delta_E_CIE2000,
6
+ delta_E_loss,
7
+ xyY_to_Lab,
8
+ xyY_to_XYZ,
9
+ )
10
+
11
+ __all__ = [
12
+ "delta_E_CIE2000",
13
+ "delta_E_loss",
14
+ "xyY_to_Lab",
15
+ "xyY_to_XYZ",
16
+ "XYZ_to_Lab",
17
+ ]
learning_munsell/losses/jax_delta_e.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Differentiable Delta-E Loss Functions using JAX
3
+ ================================================
4
+
5
+ This module provides JAX implementations of color space conversions
6
+ and Delta-E (CIE2000) loss function for use in training.
7
+
8
+ The key insight is that we can compute Delta-E between:
9
+ - The input xyY (which we convert to Lab as the "target")
10
+ - The predicted Munsell converted back to Lab
11
+
12
+ For the Munsell -> xyY conversion, we either:
13
+ 1. Use a pre-trained neural network approximator
14
+ 2. Use differentiable interpolation on the Munsell Renotation data
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import colour
20
+ import jax
21
+ import jax.numpy as jnp
22
+ import numpy as np
23
+ from jax import Array
24
+
25
# D65 illuminant XYZ reference values (standard for sRGB).
# On the 0-100 scale (Y = 100 for the reference white), matching the
# Y * 100 scaling applied in xyY_to_XYZ below.
D65_XYZ = jnp.array([95.047, 100.0, 108.883])

# Illuminant C XYZ reference values (used by Munsell system), same 0-100 scale.
ILLUMINANT_C_XYZ = jnp.array([98.074, 100.0, 118.232])
30
+
31
+
32
def xyY_to_XYZ(xyY: Array, scale_Y: bool = True) -> Array:
    """
    Convert CIE xyY to CIE XYZ.

    Parameters
    ----------
    xyY : Array
        CIE xyY values with shape (..., 3)
    scale_Y : bool
        If True, scale Y from 0-1 to 0-100 range (required for Lab conversion)

    Returns
    -------
    Array
        CIE XYZ values with shape (..., 3)
    """
    x = xyY[..., 0]
    y = xyY[..., 1]
    # The colour library works on the 0-100 luminance range.
    Y = xyY[..., 2] * 100.0 if scale_Y else xyY[..., 2]

    # Guard the division; rows with y == 0 are forced to X = Z = 0 below.
    is_zero = y == 0
    safe_y = jnp.where(is_zero, 1e-10, y)

    X = jnp.where(is_zero, 0.0, (x * Y) / safe_y)
    Z = jnp.where(is_zero, 0.0, ((1 - x - y) * Y) / safe_y)

    return jnp.stack([X, Y, Z], axis=-1)
67
+
68
+
69
def XYZ_to_Lab(XYZ: Array, illuminant: Array = ILLUMINANT_C_XYZ) -> Array:
    """
    Convert CIE XYZ to CIE Lab.

    Parameters
    ----------
    XYZ : Array
        CIE XYZ values with shape (..., 3)
    illuminant : Array
        Reference white XYZ values

    Returns
    -------
    Array
        CIE Lab values with shape (..., 3)
    """
    # Normalize against the reference white.
    t = XYZ / illuminant

    delta = 6.0 / 29.0

    # Piecewise CIE transfer function: cube root above (6/29)^3, linear below.
    def transfer(u: Array) -> Array:
        linear = u / (3 * delta**2) + 4.0 / 29.0
        return jnp.where(u > delta**3, jnp.cbrt(u), linear)

    fx = transfer(t[..., 0])
    fy = transfer(t[..., 1])
    fz = transfer(t[..., 2])

    return jnp.stack(
        [116.0 * fy - 16.0, 500.0 * (fx - fy), 200.0 * (fy - fz)],
        axis=-1,
    )
105
+
106
+
107
def xyY_to_Lab(xyY: Array, illuminant: Array = ILLUMINANT_C_XYZ) -> Array:
    """Convert CIE xyY directly to CIE Lab."""
    # Two-step pipeline: xyY -> XYZ, then XYZ -> Lab under the given white.
    XYZ = xyY_to_XYZ(xyY)
    return XYZ_to_Lab(XYZ, illuminant)
110
+
111
+
112
def delta_E_CIE2000(Lab_1: Array, Lab_2: Array) -> Array:
    """
    Compute CIE 2000 Delta-E color difference.

    This is a differentiable JAX implementation of the CIE 2000 Delta-E formula.

    Parameters
    ----------
    Lab_1 : Array
        First CIE Lab color(s) with shape (..., 3)
    Lab_2 : Array
        Second CIE Lab color(s) with shape (..., 3)

    Returns
    -------
    Array
        Delta-E values with shape (...)

    Notes
    -----
    Follows the structure of Sharma, Wu & Dalal (2005), "The CIEDE2000
    Color-Difference Formula: Implementation Notes, Supplementary Test Data,
    and Mathematical Observations". All piecewise branches are expressed
    with ``jnp.where`` so the function stays traceable and differentiable
    under ``jax.jit`` / ``jax.grad``.
    """
    L_1, a_1, b_1 = Lab_1[..., 0], Lab_1[..., 1], Lab_1[..., 2]
    L_2, a_2, b_2 = Lab_2[..., 0], Lab_2[..., 1], Lab_2[..., 2]

    # Chroma
    C_1_ab = jnp.sqrt(a_1**2 + b_1**2)
    C_2_ab = jnp.sqrt(a_2**2 + b_2**2)

    C_bar_ab = (C_1_ab + C_2_ab) / 2
    C_bar_ab_7 = C_bar_ab**7

    # G factor for a' adjustment (25^7 = 6103515625.0)
    G = 0.5 * (1 - jnp.sqrt(C_bar_ab_7 / (C_bar_ab_7 + 6103515625.0)))

    # Adjusted a'
    a_p_1 = (1 + G) * a_1
    a_p_2 = (1 + G) * a_2

    # Adjusted chroma C'
    C_p_1 = jnp.sqrt(a_p_1**2 + b_1**2)
    C_p_2 = jnp.sqrt(a_p_2**2 + b_2**2)

    # Hue angle h' (in degrees), wrapped to [0, 360)
    h_p_1 = jnp.degrees(jnp.arctan2(b_1, a_p_1)) % 360
    h_p_2 = jnp.degrees(jnp.arctan2(b_2, a_p_2)) % 360

    # Handle achromatic case (a' = b' = 0): hue angle is defined as 0
    h_p_1 = jnp.where((b_1 == 0) & (a_p_1 == 0), 0.0, h_p_1)
    h_p_2 = jnp.where((b_2 == 0) & (a_p_2 == 0), 0.0, h_p_2)

    # Delta L', C'
    delta_L_p = L_2 - L_1
    delta_C_p = C_p_2 - C_p_1

    # Delta h'
    h_p_diff = h_p_2 - h_p_1
    C_p_product = C_p_1 * C_p_2

    # Shortest angular difference; forced to 0 when either color is achromatic
    delta_h_p = jnp.where(
        C_p_product == 0,
        0.0,
        jnp.where(
            jnp.abs(h_p_diff) <= 180,
            h_p_diff,
            jnp.where(h_p_diff > 180, h_p_diff - 360, h_p_diff + 360),
        ),
    )

    # Delta H'
    delta_H_p = 2 * jnp.sqrt(C_p_product) * jnp.sin(jnp.radians(delta_h_p / 2))

    # Mean L', C'
    L_bar_p = (L_1 + L_2) / 2
    C_bar_p = (C_p_1 + C_p_2) / 2

    # Mean h'
    h_p_sum = h_p_1 + h_p_2
    h_p_abs_diff = jnp.abs(h_p_1 - h_p_2)

    # When either chroma is zero the mean hue degenerates to the plain sum,
    # matching the CIEDE2000 implementation notes.
    h_bar_p = jnp.where(
        C_p_product == 0,
        h_p_sum,
        jnp.where(
            h_p_abs_diff <= 180,
            h_p_sum / 2,
            jnp.where(h_p_sum < 360, (h_p_sum + 360) / 2, (h_p_sum - 360) / 2),
        ),
    )

    # T factor
    T = (
        1
        - 0.17 * jnp.cos(jnp.radians(h_bar_p - 30))
        + 0.24 * jnp.cos(jnp.radians(2 * h_bar_p))
        + 0.32 * jnp.cos(jnp.radians(3 * h_bar_p + 6))
        - 0.20 * jnp.cos(jnp.radians(4 * h_bar_p - 63))
    )

    # Delta theta (rotation term, Gaussian centered at h' = 275 degrees)
    delta_theta = 30 * jnp.exp(-(((h_bar_p - 275) / 25) ** 2))

    # R_C (25^7 = 6103515625.0)
    C_bar_p_7 = C_bar_p**7
    R_C = 2 * jnp.sqrt(C_bar_p_7 / (C_bar_p_7 + 6103515625.0))

    # S_L, S_C, S_H weighting functions
    L_bar_p_minus_50_sq = (L_bar_p - 50) ** 2
    S_L = 1 + (0.015 * L_bar_p_minus_50_sq) / jnp.sqrt(20 + L_bar_p_minus_50_sq)
    S_C = 1 + 0.045 * C_bar_p
    S_H = 1 + 0.015 * C_bar_p * T

    # R_T
    R_T = -jnp.sin(jnp.radians(2 * delta_theta)) * R_C

    # Final Delta E (parametric factors fixed at unity)
    k_L, k_C, k_H = 1.0, 1.0, 1.0

    term_L = delta_L_p / (k_L * S_L)
    term_C = delta_C_p / (k_C * S_C)
    term_H = delta_H_p / (k_H * S_H)

    # NOTE(review): jnp.sqrt has an unbounded gradient at 0, so gradients may
    # be non-finite for exactly achromatic inputs (C' = 0) — confirm whether
    # training batches can contain neutral colors.
    return jnp.sqrt(term_L**2 + term_C**2 + term_H**2 + R_T * term_C * term_H)
231
+
232
+
233
def delta_E_loss(pred_xyY: Array, target_xyY: Array) -> Array:
    """
    Compute mean Delta-E loss between predicted and target xyY values.

    This is the primary loss function for training with perceptual accuracy.

    Parameters
    ----------
    pred_xyY : Array
        Predicted xyY values with shape (batch, 3)
    target_xyY : Array
        Target xyY values with shape (batch, 3)

    Returns
    -------
    Array
        Scalar mean Delta-E loss
    """
    # Map both sets of predictions into Lab and average the perceptual error.
    differences = delta_E_CIE2000(xyY_to_Lab(pred_xyY), xyY_to_Lab(target_xyY))
    return jnp.mean(differences)
254
+
255
+
256
# JIT-compiled versions for performance.
# NOTE: jax.jit compiles lazily on the first call for each new input
# shape/dtype combination.
xyY_to_XYZ_jit = jax.jit(xyY_to_XYZ)
XYZ_to_Lab_jit = jax.jit(XYZ_to_Lab)
xyY_to_Lab_jit = jax.jit(xyY_to_Lab)
delta_E_CIE2000_jit = jax.jit(delta_E_CIE2000)
delta_E_loss_jit = jax.jit(delta_E_loss)

# Gradient functions.
# jax.grad differentiates with respect to the first argument (pred_xyY).
grad_delta_E_loss = jax.grad(delta_E_loss)
265
+
266
+
267
def test_jax_delta_e() -> None:
    """
    Test the JAX Delta-E implementation against the colour library.

    Verifies that:

    - Lab values from the JAX conversion chain match the colour library's
      reference implementation under illuminant C.
    - The Delta-E CIE 2000 values agree within a small tolerance.
    - The gradient of the Delta-E loss is finite and non-zero.

    Raises
    ------
    AssertionError
        If any of the comparisons fail.
    """
    # Test xyY values
    xyY_1 = np.array([0.3127, 0.3290, 0.5])  # D65 white point, Y=0.5
    xyY_2 = np.array([0.35, 0.35, 0.5])  # Slightly shifted

    # Convert using JAX
    Lab_1_jax = xyY_to_Lab(jnp.array(xyY_1))
    Lab_2_jax = xyY_to_Lab(jnp.array(xyY_2))
    delta_E_jax = delta_E_CIE2000(Lab_1_jax, Lab_2_jax)

    # Convert using colour library (reference implementation)
    XYZ_1 = colour.xyY_to_XYZ(xyY_1)
    XYZ_2 = colour.xyY_to_XYZ(xyY_2)
    illuminant_C = colour.CCS_ILLUMINANTS["CIE 1931 2 Degree Standard Observer"]["C"]
    Lab_1_colour = colour.XYZ_to_Lab(XYZ_1, illuminant_C)
    Lab_2_colour = colour.XYZ_to_Lab(XYZ_2, illuminant_C)
    delta_E_colour = colour.delta_E(Lab_1_colour, Lab_2_colour, method="CIE 2000")

    # The previous version computed these values but asserted nothing, so the
    # test could never fail. Compare explicitly; tolerances are loose enough
    # for JAX's default float32 precision.
    np.testing.assert_allclose(
        np.asarray(Lab_1_jax), Lab_1_colour, rtol=1e-3, atol=5e-2
    )
    np.testing.assert_allclose(
        np.asarray(Lab_2_jax), Lab_2_colour, rtol=1e-3, atol=5e-2
    )
    np.testing.assert_allclose(
        float(delta_E_jax), float(delta_E_colour), rtol=1e-3, atol=5e-2
    )

    # Test gradient computation
    pred_xyY = jnp.array([[0.35, 0.35, 0.5]])
    target_xyY = jnp.array([[0.3127, 0.3290, 0.5]])

    grad_fn = jax.grad(lambda x: delta_E_loss(x, target_xyY))
    grads = np.asarray(grad_fn(pred_xyY))
    assert np.all(np.isfinite(grads)), "Delta-E loss gradient is not finite"
    assert np.any(grads != 0.0), "Delta-E loss gradient is identically zero"


if __name__ == "__main__":
    test_jax_delta_e()
learning_munsell/models/__init__.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Neural network models for Munsell color conversions."""

# Re-export the public model classes. This list is kept in sync with
# ``learning_munsell.models.networks.__all__``; the previous version omitted
# the ResNet-based models (ComponentResNet, MultiResNetToMunsell,
# MultiResNetErrorPredictorToMunsell) even though networks.py exports them.
from learning_munsell.models.networks import (
    # Building blocks
    ResidualBlock,
    # Component networks
    ComponentMLP,
    ComponentResNet,
    ComponentErrorPredictor,
    # Transformer building blocks
    FeatureTokenizer,
    TransformerBlock,
    # Composite models: xyY → Munsell
    MLPToMunsell,
    MultiHeadMLPToMunsell,
    MultiMLPToMunsell,
    MultiResNetToMunsell,
    TransformerToMunsell,
    # Error predictors: xyY → Munsell
    MultiHeadErrorPredictorToMunsell,
    MultiMLPErrorPredictorToMunsell,
    MultiResNetErrorPredictorToMunsell,
    # Composite models: Munsell → xyY
    MultiMLPToxyY,
    # Error predictors: Munsell → xyY
    MultiMLPErrorPredictorToxyY,
)

__all__ = [
    # Building blocks
    "ResidualBlock",
    # Component networks (single output)
    "ComponentMLP",
    "ComponentResNet",
    "ComponentErrorPredictor",
    # Transformer building blocks
    "FeatureTokenizer",
    "TransformerBlock",
    # Composite models: xyY → Munsell
    "MLPToMunsell",
    "MultiHeadMLPToMunsell",
    "MultiMLPToMunsell",
    "MultiResNetToMunsell",
    "TransformerToMunsell",
    # Error predictors: xyY → Munsell
    "MultiHeadErrorPredictorToMunsell",
    "MultiMLPErrorPredictorToMunsell",
    "MultiResNetErrorPredictorToMunsell",
    # Composite models: Munsell → xyY
    "MultiMLPToxyY",
    # Error predictors: Munsell → xyY
    "MultiMLPErrorPredictorToxyY",
]
learning_munsell/models/networks.py ADDED
@@ -0,0 +1,1294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Reusable neural network building blocks.
3
+
4
+ Provides shared network architectures for training scripts,
5
+ including MLP components and error predictors.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import torch
11
+ from torch import nn, Tensor
12
+
13
+ __all__ = [
14
+ # Building blocks
15
+ "ResidualBlock",
16
+ # Component networks (single output)
17
+ "ComponentMLP",
18
+ "ComponentResNet",
19
+ "ComponentErrorPredictor",
20
+ # Transformer building blocks
21
+ "FeatureTokenizer",
22
+ "TransformerBlock",
23
+ # Composite models: xyY → Munsell
24
+ "MLPToMunsell",
25
+ "MultiHeadMLPToMunsell",
26
+ "MultiMLPToMunsell",
27
+ "MultiResNetToMunsell",
28
+ "TransformerToMunsell",
29
+ # Error predictors: xyY → Munsell
30
+ "MultiHeadErrorPredictorToMunsell",
31
+ "MultiMLPErrorPredictorToMunsell",
32
+ "MultiResNetErrorPredictorToMunsell",
33
+ # Composite models: Munsell → xyY
34
+ "MultiMLPToxyY",
35
+ # Error predictors: Munsell → xyY
36
+ "MultiMLPErrorPredictorToxyY",
37
+ ]
38
+
39
+
40
+ # =============================================================================
41
+ # Building Blocks
42
+ # =============================================================================
43
+
44
+
45
class ResidualBlock(nn.Module):
    """
    Residual block: two linear layers with a skip connection.

    Computes ``GELU(x + block(x))`` where ``block`` is
    Linear → GELU → BatchNorm → Linear → BatchNorm.

    Parameters
    ----------
    dim : int
        Dimension of input and output features.

    Attributes
    ----------
    block : nn.Sequential
        The non-identity path of the residual connection.
    activation : nn.GELU
        Activation applied after adding the skip connection.
    """

    def __init__(self, dim: int) -> None:
        """Initialize residual block."""
        super().__init__()
        self.block = nn.Sequential(
            nn.Linear(dim, dim),
            nn.GELU(),
            nn.BatchNorm1d(dim),
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
        )
        self.activation = nn.GELU()

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply the residual transformation.

        Parameters
        ----------
        x : Tensor
            Input tensor of shape (batch_size, dim).

        Returns
        -------
        Tensor
            Output tensor of shape (batch_size, dim).
        """
        residual = x
        transformed = self.block(x)
        return self.activation(residual + transformed)
91
+
92
+
93
+ # =============================================================================
94
+ # Component Networks (Single Output)
95
+ # =============================================================================
96
+
97
+
98
class ComponentMLP(nn.Module):
    """
    Independent MLP for a single Munsell component.

    Architecture: input_dim → 128 → 256 → 512 → 256 → 128 → 1
    (hidden widths scaled by ``width_multiplier``).

    Parameters
    ----------
    input_dim : int, optional
        Input feature dimension. Default is 3 (for xyY).
    width_multiplier : float, optional
        Multiplier for hidden layer dimensions. Default is 1.0.
    dropout : float, optional
        Dropout probability between layers. Default is 0.0.

    Attributes
    ----------
    network : nn.Sequential
        Encoder-decoder feed-forward stack ending in a single output unit.

    Notes
    -----
    Each hidden stage is Linear → ReLU → BatchNorm. When ``dropout`` is
    positive, a Dropout layer follows every hidden stage except the last
    one before the output projection.
    """

    def __init__(
        self,
        input_dim: int = 3,
        width_multiplier: float = 1.0,
        dropout: float = 0.0,
    ) -> None:
        """Initialize the component-specific MLP."""
        super().__init__()

        # Encoder-decoder widths, scaled by the multiplier.
        h1 = int(128 * width_multiplier)
        h2 = int(256 * width_multiplier)
        h3 = int(512 * width_multiplier)
        widths = [input_dim, h1, h2, h3, h2, h1]

        layers: list[nn.Module] = []
        final_stage = len(widths) - 2
        for stage, (d_in, d_out) in enumerate(zip(widths[:-1], widths[1:])):
            layers += [nn.Linear(d_in, d_out), nn.ReLU(), nn.BatchNorm1d(d_out)]
            # No dropout after the last hidden stage, matching the
            # encoder-decoder layout.
            if dropout > 0 and stage < final_stage:
                layers.append(nn.Dropout(dropout))

        # Output projection to a single component value.
        layers.append(nn.Linear(h1, 1))

        self.network = nn.Sequential(*layers)

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass through the component-specific network.

        Parameters
        ----------
        x : Tensor
            Input tensor of shape (batch_size, input_dim).

        Returns
        -------
        Tensor
            Output tensor of shape (batch_size, 1) containing the predicted
            component value.
        """
        return self.network(x)
212
+
213
+
214
class ComponentResNet(nn.Module):
    """
    Independent ResNet for a single Munsell component with true skip connections.

    Architecture: input → projection → ResidualBlock × num_blocks → output.
    Each block computes ``activation(x + f(x))``, unlike the purely
    sequential ComponentMLP.

    Parameters
    ----------
    input_dim : int, optional
        Input feature dimension. Default is 3 (for xyY).
    hidden_dim : int, optional
        Hidden dimension for residual blocks. Default is 256.
    num_blocks : int, optional
        Number of residual blocks. Default is 4.

    Attributes
    ----------
    input_proj : nn.Sequential
        Lifts the input to the residual trunk's width with a GELU.
    res_blocks : nn.ModuleList
        Stack of ResidualBlock modules with skip connections.
    output_proj : nn.Linear
        Collapses the hidden representation to a single output.
    """

    def __init__(
        self,
        input_dim: int = 3,
        hidden_dim: int = 256,
        num_blocks: int = 4,
    ) -> None:
        """Initialize the component-specific ResNet."""
        super().__init__()

        # Lift the input into the residual trunk's width.
        self.input_proj = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.GELU(),
        )

        # Residual trunk.
        self.res_blocks = nn.ModuleList(
            [ResidualBlock(hidden_dim) for _ in range(num_blocks)]
        )

        # Collapse to the single component prediction.
        self.output_proj = nn.Linear(hidden_dim, 1)

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass through the ResNet with skip connections.

        Parameters
        ----------
        x : Tensor
            Input tensor of shape (batch_size, input_dim).

        Returns
        -------
        Tensor
            Output tensor of shape (batch_size, 1).
        """
        hidden = self.input_proj(x)
        for res_block in self.res_blocks:
            # Each block applies: activation(hidden + f(hidden))
            hidden = res_block(hidden)
        return self.output_proj(hidden)
283
+
284
+
285
class ComponentErrorPredictor(nn.Module):
    """
    Independent error predictor for a single Munsell component.

    A deep MLP that learns residual corrections for one Munsell component
    (hue, value, chroma, or code).

    Parameters
    ----------
    input_dim : int, optional
        Input feature dimension. Default is 7 (xyY_norm + base_pred_norm).
    width_multiplier : float, optional
        Multiplier for hidden layer widths. Default is 1.0.
        Use 1.5 for chroma which requires more capacity.

    Attributes
    ----------
    network : nn.Sequential
        Encoder-decoder stack input → 128 → 256 → 512 → 256 → 128 → 1,
        each hidden stage being Linear → GELU → BatchNorm.

    Notes
    -----
    Default input is [xyY_norm (3) + base_pred_norm (4)] = 7 features.
    Output is a single scalar error correction for the component.
    """

    def __init__(
        self,
        input_dim: int = 7,
        width_multiplier: float = 1.0,
    ) -> None:
        """Initialize the error predictor."""
        super().__init__()

        # Encoder-decoder widths, scaled by the multiplier.
        h1 = int(128 * width_multiplier)
        h2 = int(256 * width_multiplier)
        h3 = int(512 * width_multiplier)
        widths = [input_dim, h1, h2, h3, h2, h1]

        stages: list[nn.Module] = []
        for d_in, d_out in zip(widths[:-1], widths[1:]):
            stages += [nn.Linear(d_in, d_out), nn.GELU(), nn.BatchNorm1d(d_out)]

        # Output projection to the scalar correction.
        stages.append(nn.Linear(h1, 1))

        self.network = nn.Sequential(*stages)

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass through the error predictor.

        Parameters
        ----------
        x : Tensor
            Combined input of shape (batch_size, input_dim).

        Returns
        -------
        Tensor
            Predicted error correction of shape (batch_size, 1).
        """
        return self.network(x)
362
+
363
+
364
+ # =============================================================================
365
+ # Transformer Building Blocks
366
+ # =============================================================================
367
+
368
+
369
class FeatureTokenizer(nn.Module):
    """
    Tokenize each input feature into a high-dimensional embedding.

    Each scalar input feature is mapped through its own linear layer to an
    embedding vector, analogous to word embeddings in NLP. A learnable CLS
    token is prepended and used downstream for regression.

    Parameters
    ----------
    num_features : int
        Number of input features to tokenize.
    embedding_dim : int
        Dimensionality of each token embedding.

    Attributes
    ----------
    feature_embeddings : nn.ModuleList
        One linear embedding layer per input feature.
    cls_token : nn.Parameter
        Learnable classification token prepended to feature tokens.
    """

    def __init__(self, num_features: int, embedding_dim: int) -> None:
        """Initialize the feature tokenizer."""
        super().__init__()
        # One dedicated embedding per scalar feature.
        self.feature_embeddings = nn.ModuleList(
            [nn.Linear(1, embedding_dim) for _ in range(num_features)]
        )
        # CLS token for regression.
        self.cls_token = nn.Parameter(torch.randn(1, 1, embedding_dim))

    def forward(self, x: Tensor) -> Tensor:
        """
        Transform input features into token embeddings.

        Parameters
        ----------
        x : Tensor
            Input tensor of shape (batch_size, num_features).

        Returns
        -------
        Tensor
            Token embeddings of shape (batch_size, 1+num_features,
            embedding_dim); the first token is CLS, followed by one token
            per feature.
        """
        batch_size = x.size(0)

        # One (batch_size, 1, embedding_dim) token per scalar feature,
        # concatenated along the sequence axis.
        feature_tokens = torch.cat(
            [
                embed(x[:, i : i + 1]).unsqueeze(1)
                for i, embed in enumerate(self.feature_embeddings)
            ],
            dim=1,
        )

        # Broadcast the learnable CLS token over the batch and prepend it.
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        return torch.cat([cls_tokens, feature_tokens], dim=1)
438
+
439
+
440
class TransformerBlock(nn.Module):
    """
    Standard transformer block: multi-head self-attention plus feedforward.

    Post-norm layout with residual connections: attention output is added to
    the input and layer-normalized, then the same pattern is applied to the
    feedforward sub-layer.

    Parameters
    ----------
    embedding_dim : int
        Dimension of token embeddings.
    num_heads : int
        Number of attention heads.
    ff_dim : int
        Hidden dimension of the feedforward network.
    dropout : float, optional
        Dropout probability, default is 0.1.

    Attributes
    ----------
    attention : nn.MultiheadAttention
        Multi-head self-attention mechanism (batch_first).
    norm1 : nn.LayerNorm
        Layer normalization after attention.
    feedforward : nn.Sequential
        Two-layer feedforward network with GELU activation.
    norm2 : nn.LayerNorm
        Layer normalization after the feedforward sub-layer.
    """

    def __init__(
        self, embedding_dim: int, num_heads: int, ff_dim: int, dropout: float = 0.1
    ) -> None:
        """Initialize the transformer block."""
        super().__init__()

        self.attention = nn.MultiheadAttention(
            embedding_dim, num_heads, dropout=dropout, batch_first=True
        )
        self.norm1 = nn.LayerNorm(embedding_dim)

        self.feedforward = nn.Sequential(
            nn.Linear(embedding_dim, ff_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(ff_dim, embedding_dim),
            nn.Dropout(dropout),
        )
        self.norm2 = nn.LayerNorm(embedding_dim)

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply the transformer block to input tokens.

        Parameters
        ----------
        x : Tensor
            Input tokens of shape (batch_size, num_tokens, embedding_dim).

        Returns
        -------
        Tensor
            Transformed tokens of shape (batch_size, num_tokens, embedding_dim).
        """
        # Self-attention sub-layer with residual connection.
        attended, _ = self.attention(x, x, x)
        x = self.norm1(x + attended)

        # Feedforward sub-layer with residual connection.
        return self.norm2(x + self.feedforward(x))
511
+
512
+
513
+ # =============================================================================
514
+ # Composite Models: xyY → Munsell
515
+ # =============================================================================
516
+
517
+
518
class MLPToMunsell(nn.Module):
    """
    Large MLP for xyY to Munsell conversion.

    Architecture: 3 → 128 → 256 → 512 → 512 → 256 → 128 → 4, where each
    hidden stage is Linear → ReLU → BatchNorm.

    Attributes
    ----------
    network : nn.Sequential
        Feed-forward stack ending in a 4-unit output layer.
    """

    def __init__(self) -> None:
        """Initialize the MunsellMLP network."""
        super().__init__()

        widths = [3, 128, 256, 512, 512, 256, 128]
        stages: list[nn.Module] = []
        for d_in, d_out in zip(widths[:-1], widths[1:]):
            stages += [nn.Linear(d_in, d_out), nn.ReLU(), nn.BatchNorm1d(d_out)]

        # Output projection to the four Munsell components.
        stages.append(nn.Linear(128, 4))

        self.network = nn.Sequential(*stages)

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass through the network.

        Parameters
        ----------
        x : Tensor
            Input tensor of shape (batch_size, 3) containing normalized xyY values.

        Returns
        -------
        Tensor
            Output tensor of shape (batch_size, 4) containing normalized Munsell
            specifications [hue, value, chroma, code].
        """
        return self.network(x)
572
+
573
+
574
class MultiHeadMLPToMunsell(nn.Module):
    """
    Multi-head MLP for xyY to Munsell conversion.

    A shared encoder feeds four specialized decoder heads, one per Munsell
    component (hue, value, chroma, code). The chroma head is wider to handle
    the more complex non-linear relationship between xyY and chroma.

    Attributes
    ----------
    encoder : nn.Sequential
        Shared encoder: 3 → 128 → 256 → 512 with ReLU and BatchNorm.
    hue_head : nn.Sequential
        Hue decoder: 512 → 256 → 128 → 1 (circular component).
    value_head : nn.Sequential
        Value decoder: 512 → 256 → 128 → 1 (linear component).
    chroma_head : nn.Sequential
        Chroma decoder: 512 → 384 → 256 → 128 → 1 (wider for complexity).
    code_head : nn.Sequential
        Code decoder: 512 → 256 → 128 → 1 (discrete component).
    """

    def __init__(self) -> None:
        """Initialize the multi-head MLP model."""
        super().__init__()

        # Shared encoder - learns general color space features.
        self.encoder = nn.Sequential(
            nn.Linear(3, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
        )

        # Component-specific decoder heads. Chroma gets an extra, wider
        # stage (512 → 384) because it is the hardest component to fit.
        self.hue_head = self._make_head([512, 256, 128])
        self.value_head = self._make_head([512, 256, 128])
        self.chroma_head = self._make_head([512, 384, 256, 128])
        self.code_head = self._make_head([512, 256, 128])

    @staticmethod
    def _make_head(widths: list[int]) -> nn.Sequential:
        """Build a decoder head: Linear→ReLU→BatchNorm stages plus a final Linear to 1."""
        stages: list[nn.Module] = []
        for d_in, d_out in zip(widths[:-1], widths[1:]):
            stages += [nn.Linear(d_in, d_out), nn.ReLU(), nn.BatchNorm1d(d_out)]
        stages.append(nn.Linear(widths[-1], 1))
        return nn.Sequential(*stages)

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass through the multi-head network.

        Parameters
        ----------
        x : Tensor
            Input xyY values of shape (batch_size, 3).

        Returns
        -------
        Tensor
            Concatenated Munsell predictions [hue, value, chroma, code]
            of shape (batch_size, 4).
        """
        # Shared feature extraction, then one prediction per head,
        # concatenated in [hue, value, chroma, code] order.
        features = self.encoder(x)
        heads = (self.hue_head, self.value_head, self.chroma_head, self.code_head)
        return torch.cat([head(features) for head in heads], dim=1)
691
+
692
+
693
class MultiMLPToMunsell(nn.Module):
    """
    Multi-MLP for xyY to Munsell conversion.

    Four fully independent ComponentMLP branches, one per Munsell component.
    The chroma branch can be widened to handle its more complex relationship
    with xyY.

    Parameters
    ----------
    chroma_width_multiplier : float, optional
        Width multiplier for the chroma branch. Default is 2.0.
    dropout : float, optional
        Dropout probability for all branches. Default is 0.1.

    Attributes
    ----------
    hue_branch : ComponentMLP
        MLP for hue component (1.0x width).
    value_branch : ComponentMLP
        MLP for value component (1.0x width).
    chroma_branch : ComponentMLP
        MLP for chroma component (configurable width).
    code_branch : ComponentMLP
        MLP for hue code component (1.0x width).
    """

    def __init__(
        self, chroma_width_multiplier: float = 2.0, dropout: float = 0.1
    ) -> None:
        """Initialize the multi-branch MLP model."""
        super().__init__()

        # Base-width branches; chroma gets its own multiplier.
        self.hue_branch = ComponentMLP(
            input_dim=3, width_multiplier=1.0, dropout=dropout
        )
        self.value_branch = ComponentMLP(
            input_dim=3, width_multiplier=1.0, dropout=dropout
        )
        self.chroma_branch = ComponentMLP(
            input_dim=3, width_multiplier=chroma_width_multiplier, dropout=dropout
        )
        self.code_branch = ComponentMLP(
            input_dim=3, width_multiplier=1.0, dropout=dropout
        )

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass through all 4 independent branches.

        Parameters
        ----------
        x : Tensor
            Input tensor of shape (batch_size, 3) containing normalized xyY values.

        Returns
        -------
        Tensor
            Concatenated predictions [hue, value, chroma, code]
            of shape (batch_size, 4).
        """
        branches = (
            self.hue_branch,
            self.value_branch,
            self.chroma_branch,
            self.code_branch,
        )
        return torch.cat([branch(x) for branch in branches], dim=1)
758
+
759
+
760
class MultiResNetToMunsell(nn.Module):
    """
    Multi-ResNet for xyY to Munsell conversion with true skip connections.

    Four independent ComponentResNet branches, one per Munsell component,
    each built from residual blocks with skip connections.

    Parameters
    ----------
    hidden_dim : int, optional
        Hidden dimension for residual blocks. Default is 256.
    num_blocks : int, optional
        Number of residual blocks per branch. Default is 4.
    chroma_hidden_dim : int, optional
        Hidden dimension for the chroma branch (typically larger). Default is 512.

    Attributes
    ----------
    hue_branch : ComponentResNet
        ResNet for hue component.
    value_branch : ComponentResNet
        ResNet for value component.
    chroma_branch : ComponentResNet
        ResNet for chroma component (larger hidden dim).
    code_branch : ComponentResNet
        ResNet for hue code component.
    """

    def __init__(
        self,
        hidden_dim: int = 256,
        num_blocks: int = 4,
        chroma_hidden_dim: int = 512,
    ) -> None:
        """Initialize the multi-branch ResNet model."""
        super().__init__()

        # Base-width trunks; chroma gets its own (wider) hidden dimension.
        self.hue_branch = ComponentResNet(
            input_dim=3, hidden_dim=hidden_dim, num_blocks=num_blocks
        )
        self.value_branch = ComponentResNet(
            input_dim=3, hidden_dim=hidden_dim, num_blocks=num_blocks
        )
        self.chroma_branch = ComponentResNet(
            input_dim=3, hidden_dim=chroma_hidden_dim, num_blocks=num_blocks
        )
        self.code_branch = ComponentResNet(
            input_dim=3, hidden_dim=hidden_dim, num_blocks=num_blocks
        )

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass through all 4 independent ResNet branches.

        Parameters
        ----------
        x : Tensor
            Input tensor of shape (batch_size, 3) containing normalized xyY values.

        Returns
        -------
        Tensor
            Concatenated predictions [hue, value, chroma, code]
            of shape (batch_size, 4).
        """
        branches = (
            self.hue_branch,
            self.value_branch,
            self.chroma_branch,
            self.code_branch,
        )
        return torch.cat([branch(x) for branch in branches], dim=1)
830
+
831
+
832
class TransformerToMunsell(nn.Module):
    """
    Transformer for xyY to Munsell conversion.

    Input features are embedded by a feature tokenizer (which also adds a
    CLS token), processed by a stack of self-attention transformer blocks,
    and read out by one specialized head per Munsell component.

    Parameters
    ----------
    num_features : int, optional
        Number of input features (default is 3 for xyY).
    embedding_dim : int, optional
        Dimension of token embeddings (default is 256).
    num_blocks : int, optional
        Number of transformer blocks (default is 6).
    num_heads : int, optional
        Number of attention heads (default is 8).
    ff_dim : int, optional
        Feedforward network hidden dimension (default is 1024).
    dropout : float, optional
        Dropout probability (default is 0.1).

    Attributes
    ----------
    tokenizer : FeatureTokenizer
        Converts input features to token embeddings with CLS token.
    transformer_blocks : nn.ModuleList
        Stack of transformer blocks with self-attention.
    final_norm : nn.LayerNorm
        Final layer normalization before output heads.
    hue_head : nn.Sequential
        Output head for hue prediction.
    value_head : nn.Sequential
        Output head for value prediction.
    chroma_head : nn.Sequential
        Deeper output head for chroma prediction.
    code_head : nn.Sequential
        Output head for hue code prediction.

    Notes
    -----
    Architecture: 3 xyY features → 3 tokens + 1 CLS token → transformer blocks
    with self-attention → multi-head output with specialized component heads.
    The chroma head has additional depth due to prediction difficulty.
    """

    def __init__(
        self,
        num_features: int = 3,
        embedding_dim: int = 256,
        num_blocks: int = 6,
        num_heads: int = 8,
        ff_dim: int = 1024,
        dropout: float = 0.1,
    ) -> None:
        """Initialize the transformer model."""
        super().__init__()

        self.tokenizer = FeatureTokenizer(num_features, embedding_dim)

        self.transformer_blocks = nn.ModuleList(
            TransformerBlock(embedding_dim, num_heads, ff_dim, dropout)
            for _ in range(num_blocks)
        )

        self.final_norm = nn.LayerNorm(embedding_dim)

        def shallow_head() -> nn.Sequential:
            # Two-layer head shared in shape by hue, value and code.
            return nn.Sequential(
                nn.Linear(embedding_dim, 128),
                nn.GELU(),
                nn.Dropout(dropout),
                nn.Linear(128, 1),
            )

        # Multi-head output - separate heads for each Munsell component
        self.hue_head = shallow_head()
        self.value_head = shallow_head()
        # Chroma is harder to predict, so its head gets an extra layer.
        self.chroma_head = nn.Sequential(
            nn.Linear(embedding_dim, 256),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(256, 128),
            nn.GELU(),
            nn.Linear(128, 1),
        )
        self.code_head = shallow_head()

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass through the transformer.

        Parameters
        ----------
        x : Tensor
            Input xyY values of shape (batch_size, 3).

        Returns
        -------
        Tensor
            Predicted Munsell specification [hue, value, chroma, code]
            of shape (batch_size, 4).

        Notes
        -----
        The CLS token representation is used for the final prediction through
        separate task-specific heads for each Munsell component.
        """
        tokens = self.tokenizer(x)

        for block in self.transformer_blocks:
            tokens = block(tokens)

        # Normalize, then keep only the CLS token (position 0) for readout.
        cls_token = self.final_norm(tokens)[:, 0, :]

        heads = (self.hue_head, self.value_head, self.chroma_head, self.code_head)
        return torch.cat([head(cls_token) for head in heads], dim=1)
964
+
965
+
966
+ # =============================================================================
967
+ # Error Predictors: xyY → Munsell
968
+ # =============================================================================
969
+
970
+
971
class MultiHeadErrorPredictorToMunsell(nn.Module):
    """
    Multi-Head error predictor for xyY to Munsell conversion.

    One ComponentErrorPredictor per Munsell component; each branch sees the
    same combined input and predicts the correction for its own component.
    The chroma branch is wider (1.5x by default) because chroma error
    patterns are harder to model.

    Parameters
    ----------
    input_dim : int, optional
        Input feature dimension. Default is 7.
    chroma_width : float, optional
        Width multiplier for chroma branch. Default is 1.5.

    Attributes
    ----------
    hue_branch : ComponentErrorPredictor
        Error predictor for hue component (1.0x width).
    value_branch : ComponentErrorPredictor
        Error predictor for value component (1.0x width).
    chroma_branch : ComponentErrorPredictor
        Error predictor for chroma component (1.5x width by default).
    code_branch : ComponentErrorPredictor
        Error predictor for hue code component (1.0x width).
    """

    def __init__(
        self,
        input_dim: int = 7,
        chroma_width: float = 1.5,
    ) -> None:
        """Initialize the multi-head error predictor."""
        super().__init__()

        def make_branch(width: float) -> "ComponentErrorPredictor":
            # Independent predictor per component; only width differs.
            return ComponentErrorPredictor(
                input_dim=input_dim, width_multiplier=width
            )

        self.hue_branch = make_branch(1.0)
        self.value_branch = make_branch(1.0)
        self.chroma_branch = make_branch(chroma_width)
        self.code_branch = make_branch(1.0)

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass through all error predictor branches.

        Parameters
        ----------
        x : Tensor
            Combined input of shape (batch_size, input_dim).

        Returns
        -------
        Tensor
            Concatenated error corrections [hue, value, chroma, code]
            of shape (batch_size, 4).
        """
        # Each branch processes the same combined input independently;
        # outputs are stacked as [Hue_error, Value_error, Chroma_error, Code_error].
        corrections = [
            branch(x)
            for branch in (
                self.hue_branch,
                self.value_branch,
                self.chroma_branch,
                self.code_branch,
            )
        ]
        return torch.cat(corrections, dim=1)
1043
+
1044
+
1045
class MultiMLPErrorPredictorToMunsell(nn.Module):
    """
    Multi-MLP error predictor for xyY to Munsell conversion.

    Uses 4 independent ComponentErrorPredictor branches, one for each
    Munsell component error.

    Parameters
    ----------
    chroma_width : float, optional
        Width multiplier for chroma branch. Default is 1.5.
    dropout : float, optional
        Dropout probability forwarded to every branch.  When ``None``
        (the default), each branch keeps its own built-in default, which
        preserves the behavior of the previous two-argument signature.
        This parameter is required by the hyperparameter-search scripts,
        which construct this class with a ``dropout=`` keyword.

    Attributes
    ----------
    hue_branch : ComponentErrorPredictor
        Error predictor for hue component (1.0x width).
    value_branch : ComponentErrorPredictor
        Error predictor for value component (1.0x width).
    chroma_branch : ComponentErrorPredictor
        Error predictor for chroma component (configurable width).
    code_branch : ComponentErrorPredictor
        Error predictor for hue code component (1.0x width).
    """

    def __init__(
        self, chroma_width: float = 1.5, dropout: "float | None" = None
    ) -> None:
        """Initialize the multi-head error predictor."""
        super().__init__()

        # Forward ``dropout`` only when it is explicitly provided so that the
        # default construction is identical to the previous signature.
        # NOTE(review): assumes ComponentErrorPredictor accepts a ``dropout``
        # keyword (as ComponentMLP does) -- confirm against its definition.
        extra = {} if dropout is None else {"dropout": dropout}

        self.hue_branch = ComponentErrorPredictor(
            input_dim=7, width_multiplier=1.0, **extra
        )
        self.value_branch = ComponentErrorPredictor(
            input_dim=7, width_multiplier=1.0, **extra
        )
        self.chroma_branch = ComponentErrorPredictor(
            input_dim=7, width_multiplier=chroma_width, **extra
        )
        self.code_branch = ComponentErrorPredictor(
            input_dim=7, width_multiplier=1.0, **extra
        )

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass through all error predictor branches.

        Parameters
        ----------
        x : Tensor
            Combined input [xyY_norm, base_pred_norm] of shape (batch_size, 7).

        Returns
        -------
        Tensor
            Concatenated error corrections [hue, value, chroma, code]
            of shape (batch_size, 4).
        """
        hue_error = self.hue_branch(x)
        value_error = self.value_branch(x)
        chroma_error = self.chroma_branch(x)
        code_error = self.code_branch(x)
        return torch.cat([hue_error, value_error, chroma_error, code_error], dim=1)
1100
+
1101
+
1102
class MultiResNetErrorPredictorToMunsell(nn.Module):
    """
    Multi-ResNet error predictor for xyY to Munsell conversion.

    Four independent ComponentResNet branches with true skip connections,
    one per Munsell component error.

    Parameters
    ----------
    hidden_dim : int, optional
        Hidden dimension for residual blocks. Default is 256.
    num_blocks : int, optional
        Number of residual blocks per branch. Default is 4.
    chroma_hidden_dim : int, optional
        Hidden dimension for chroma branch. Default is 384.

    Attributes
    ----------
    hue_branch : ComponentResNet
        ResNet error predictor for hue component.
    value_branch : ComponentResNet
        ResNet error predictor for value component.
    chroma_branch : ComponentResNet
        ResNet error predictor for chroma component.
    code_branch : ComponentResNet
        ResNet error predictor for code component.
    """

    def __init__(
        self,
        hidden_dim: int = 256,
        num_blocks: int = 4,
        chroma_hidden_dim: int = 384,
    ) -> None:
        """Initialize the multi-ResNet error predictor."""
        super().__init__()

        def make_branch(width: int) -> "ComponentResNet":
            # Input: xyY (3) + base prediction (4) = 7 features.
            return ComponentResNet(
                input_dim=7, hidden_dim=width, num_blocks=num_blocks
            )

        self.hue_branch = make_branch(hidden_dim)
        self.value_branch = make_branch(hidden_dim)
        self.chroma_branch = make_branch(chroma_hidden_dim)
        self.code_branch = make_branch(hidden_dim)

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass through all error predictor branches.

        Parameters
        ----------
        x : Tensor
            Combined input [xyY_norm, base_pred_norm] of shape (batch_size, 7).

        Returns
        -------
        Tensor
            Concatenated error corrections [hue, value, chroma, code]
            of shape (batch_size, 4).
        """
        corrections = [
            branch(x)
            for branch in (
                self.hue_branch,
                self.value_branch,
                self.chroma_branch,
                self.code_branch,
            )
        ]
        return torch.cat(corrections, dim=1)
1173
+
1174
+
1175
+ # =============================================================================
1176
+ # Composite Models: Munsell → xyY
1177
+ # =============================================================================
1178
+
1179
+
1180
class MultiMLPToxyY(nn.Module):
    """
    Multi-MLP for Munsell to xyY conversion.

    Three independent ComponentMLP branches, one per xyY component.

    Parameters
    ----------
    width_multiplier : float, optional
        Width multiplier for x and y branches. Default is 1.0.
    y_width_multiplier : float, optional
        Width multiplier for Y (luminance) branch. Default is 1.25.

    Attributes
    ----------
    x_branch : ComponentMLP
        MLP for x chromaticity component.
    y_branch : ComponentMLP
        MLP for y chromaticity component.
    Y_branch : ComponentMLP
        MLP for Y luminance component.
    """

    def __init__(
        self, width_multiplier: float = 1.0, y_width_multiplier: float = 1.25
    ) -> None:
        """Initialize the multi-MLP model."""
        super().__init__()

        # Input is the 4-component Munsell specification; the luminance
        # branch is slightly wider than the chromaticity branches.
        self.x_branch = ComponentMLP(input_dim=4, width_multiplier=width_multiplier)
        self.y_branch = ComponentMLP(input_dim=4, width_multiplier=width_multiplier)
        self.Y_branch = ComponentMLP(input_dim=4, width_multiplier=y_width_multiplier)

    def forward(self, munsell: Tensor) -> Tensor:
        """
        Forward pass through all branches.

        Parameters
        ----------
        munsell : Tensor
            Normalized Munsell specification [hue, value, chroma, code]
            of shape (batch_size, 4).

        Returns
        -------
        Tensor
            Predicted xyY values [x, y, Y] of shape (batch_size, 3).
        """
        parts = (
            self.x_branch(munsell),
            self.y_branch(munsell),
            self.Y_branch(munsell),
        )
        return torch.cat(parts, dim=1)
1234
+
1235
+
1236
+ # =============================================================================
1237
+ # Error Predictors: Munsell → xyY
1238
+ # =============================================================================
1239
+
1240
+
1241
class MultiMLPErrorPredictorToxyY(nn.Module):
    """
    Multi-MLP error predictor for Munsell to xyY conversion.

    Three independent ComponentErrorPredictor branches, one per xyY
    component error.

    Parameters
    ----------
    width_multiplier : float, optional
        Width multiplier for all branches. Default is 1.0.

    Attributes
    ----------
    x_branch : ComponentErrorPredictor
        Error predictor for x chromaticity component.
    y_branch : ComponentErrorPredictor
        Error predictor for y chromaticity component.
    Y_branch : ComponentErrorPredictor
        Error predictor for Y luminance component.
    """

    def __init__(self, width_multiplier: float = 1.0) -> None:
        """Initialize the multi-head error predictor."""
        super().__init__()

        def make_branch() -> "ComponentErrorPredictor":
            # Input: Munsell spec (4) + base xyY prediction (3) = 7 features.
            return ComponentErrorPredictor(
                input_dim=7, width_multiplier=width_multiplier
            )

        self.x_branch = make_branch()
        self.y_branch = make_branch()
        self.Y_branch = make_branch()

    def forward(self, combined_input: Tensor) -> Tensor:
        """
        Forward pass through all error predictor branches.

        Parameters
        ----------
        combined_input : Tensor
            Combined input [munsell_norm, base_pred] of shape (batch_size, 7).

        Returns
        -------
        Tensor
            Concatenated error corrections [x, y, Y] of shape (batch_size, 3).
        """
        corrections = (
            self.x_branch(combined_input),
            self.y_branch(combined_input),
            self.Y_branch(combined_input),
        )
        return torch.cat(corrections, dim=1)
learning_munsell/training/from_xyY/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Training scripts for xyY to Munsell conversion."""
learning_munsell/training/from_xyY/hyperparameter_search_error_predictor.py ADDED
@@ -0,0 +1,503 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hyperparameter search for Multi-Error Predictor using Optuna.
3
+
4
+ Optimizes:
5
+ - Learning rate
6
+ - Batch size
7
+ - Chroma width multiplier
8
+ - Loss function weights (MSE, MAE, log penalty, Huber)
9
+ - Huber delta
10
+ - Dropout
11
+
12
+ Objective: Minimize validation loss
13
+ """
14
+
15
+ import logging
16
+ from datetime import datetime
17
+ from pathlib import Path
18
+
19
+ import mlflow
20
+ import numpy as np
21
+ import onnxruntime as ort
22
+ import optuna
23
+ import torch
24
+ from numpy.typing import NDArray
25
+ from optuna.trial import Trial
26
+ from torch import nn, optim
27
+ from torch.utils.data import DataLoader, TensorDataset
28
+
29
+ from learning_munsell import PROJECT_ROOT
30
+ from learning_munsell.models.networks import (
31
+ ComponentErrorPredictor,
32
+ MultiMLPErrorPredictorToMunsell,
33
+ )
34
+ from learning_munsell.utilities.common import setup_mlflow_experiment
35
+ from learning_munsell.utilities.data import normalize_xyY, normalize_munsell
36
+
37
+ LOGGER = logging.getLogger(__name__)
38
+
39
+
40
def precision_focused_loss(
    pred: torch.Tensor,
    target: torch.Tensor,
    mse_weight: float = 1.0,
    mae_weight: float = 0.5,
    log_weight: float = 0.3,
    huber_weight: float = 0.5,
    huber_delta: float = 0.01,
) -> torch.Tensor:
    """
    Precision-focused loss function with configurable weights.

    Blends four terms, each encouraging accurate error prediction:
    - MSE: standard mean squared error
    - MAE: mean absolute error for robustness
    - Log penalty: emphasizes small residuals via log1p of the scaled error
    - Huber loss: robust to outliers with adjustable delta

    Parameters
    ----------
    pred : torch.Tensor
        Predicted values, shape (batch_size, n_components).
    target : torch.Tensor
        Target values, shape (batch_size, n_components).
    mse_weight : float, optional
        Weight for MSE component. Default is 1.0.
    mae_weight : float, optional
        Weight for MAE component. Default is 0.5.
    log_weight : float, optional
        Weight for logarithmic penalty component. Default is 0.3.
    huber_weight : float, optional
        Weight for Huber loss component. Default is 0.5.
    huber_delta : float, optional
        Delta parameter for Huber loss transition point. Default is 0.01.

    Returns
    -------
    torch.Tensor
        Weighted combination of loss components, scalar tensor.
    """
    # Compute the residual once and reuse it for every term.
    residual = pred - target
    abs_residual = residual.abs()

    mse = residual.pow(2).mean()
    mae = abs_residual.mean()
    log_penalty = torch.log1p(abs_residual * 1000.0).mean()

    # Huber via the algebraic identity: with q = min(|r|, delta),
    # huber = 0.5 * q**2 + delta * (|r| - q), which matches the usual
    # piecewise definition (quadratic inside delta, linear outside).
    quadratic = torch.clamp(abs_residual, max=huber_delta)
    huber_loss = (0.5 * quadratic**2 + huber_delta * (abs_residual - quadratic)).mean()

    return (
        mse_weight * mse
        + mae_weight * mae
        + log_weight * log_penalty
        + huber_weight * huber_loss
    )
99
+
100
+
101
def load_base_model(
    model_path: Path, params_path: Path
) -> tuple[ort.InferenceSession, dict, dict]:
    """
    Load the base ONNX model and its normalization parameters.

    Parameters
    ----------
    model_path : Path
        Path to the base model ONNX file.
    params_path : Path
        Path to the normalization parameters NPZ file.

    Returns
    -------
    ort.InferenceSession
        ONNX Runtime inference session for the base model.
    dict
        Input normalization parameters (x_range, y_range, Y_range).
    dict
        Output normalization parameters (hue_range, value_range, chroma_range, code_range).
    """
    inference_session = ort.InferenceSession(str(model_path))
    # allow_pickle is required because the parameter dicts were stored as
    # 0-d object arrays; .item() unwraps each back into a plain dict.
    saved = np.load(params_path, allow_pickle=True)
    input_params = saved["input_params"].item()
    output_params = saved["output_params"].item()
    return inference_session, input_params, output_params
126
+
127
+
128
def train_epoch(
    model: nn.Module,
    dataloader: DataLoader,
    optimizer: optim.Optimizer,
    device: torch.device,
    loss_params: dict[str, float],
) -> float:
    """
    Train the model for one epoch.

    Parameters
    ----------
    model : nn.Module
        Error predictor model to train.
    dataloader : DataLoader
        DataLoader providing training batches.
    optimizer : optim.Optimizer
        Optimizer for updating model parameters.
    device : torch.device
        Device to run training on (CPU, CUDA, or MPS).
    loss_params : dict of str to float
        Parameters for precision_focused_loss function.

    Returns
    -------
    float
        Average training loss over the epoch.
    """
    model.train()
    running_loss = 0.0

    for features, targets in dataloader:
        features = features.to(device)
        targets = targets.to(device)

        batch_loss = precision_focused_loss(model(features), targets, **loss_params)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    # Mean over batches (not samples), mirroring the validation metric.
    return running_loss / len(dataloader)
172
+
173
+
174
def validate(
    model: nn.Module,
    dataloader: DataLoader,
    device: torch.device,
    loss_params: dict[str, float],
) -> float:
    """
    Validate the model on the validation set.

    Parameters
    ----------
    model : nn.Module
        Error predictor model to validate.
    dataloader : DataLoader
        DataLoader providing validation batches.
    device : torch.device
        Device to run validation on (CPU, CUDA, or MPS).
    loss_params : dict of str to float
        Parameters for precision_focused_loss function.

    Returns
    -------
    float
        Average validation loss.
    """
    model.eval()
    batch_losses = []

    # No gradients needed during evaluation.
    with torch.no_grad():
        for features, targets in dataloader:
            outputs = model(features.to(device))
            loss = precision_focused_loss(outputs, targets.to(device), **loss_params)
            batch_losses.append(loss.item())

    return sum(batch_losses) / len(dataloader)
212
+
213
+
214
def objective(trial: Trial) -> float:
    """
    Optuna objective function to minimize validation loss.

    This function defines the hyperparameter search space and training
    procedure for each trial. It optimizes:
    - Learning rate (5e-4 to 1e-3, log scale)
    - Batch size (512 or 1024)
    - Chroma branch width multiplier (1.0 to 1.5)
    - Dropout rate (0.1 to 0.2)
    - Loss function weights (MSE, Huber)
    - Huber delta parameter (0.01 to 0.05)

    Parameters
    ----------
    trial : Trial
        Optuna trial object for suggesting hyperparameters.

    Returns
    -------
    float
        Best validation loss achieved during training.

    Raises
    ------
    FileNotFoundError
        If base model or training data files are not found.
    optuna.TrialPruned
        If trial is pruned based on intermediate results.
    """

    # Hyperparameters to optimize - constrained based on Trial 0 insights
    lr = trial.suggest_float("lr", 5e-4, 1e-3, log=True)  # Higher LR worked well
    batch_size = trial.suggest_categorical(
        "batch_size", [512, 1024]
    )  # Smaller batches better
    chroma_width = trial.suggest_float(
        "chroma_width", 1.0, 1.5, step=0.25
    )  # Smaller worked
    dropout = trial.suggest_float("dropout", 0.1, 0.2, step=0.05)

    # Simplified loss - just MSE + optional small Huber (no log penalty!)
    mse_weight = trial.suggest_float("mse_weight", 1.0, 2.0, step=0.25)
    huber_weight = trial.suggest_float("huber_weight", 0.0, 0.5, step=0.25)
    huber_delta = trial.suggest_float("huber_delta", 0.01, 0.05, step=0.01)

    # MAE and log-penalty terms are deliberately disabled for this search.
    loss_params = {
        "mse_weight": mse_weight,
        "mae_weight": 0.0,  # Fixed at 0
        "log_weight": 0.0,  # Fixed at 0 (was causing scale issues)
        "huber_weight": huber_weight,
        "huber_delta": huber_delta,
    }

    LOGGER.info("")
    LOGGER.info("=" * 80)
    LOGGER.info("Trial %d", trial.number)
    LOGGER.info("=" * 80)
    LOGGER.info(" lr: %.6f", lr)
    LOGGER.info(" batch_size: %d", batch_size)
    LOGGER.info(" chroma_width: %.2f", chroma_width)
    LOGGER.info(" dropout: %.2f", dropout)
    LOGGER.info(" mse_weight: %.2f", mse_weight)
    LOGGER.info(" huber_weight: %.2f", huber_weight)
    LOGGER.info(" huber_delta: %.3f", huber_delta)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load base model and data
    model_dir = PROJECT_ROOT / "models" / "from_xyY"
    data_dir = PROJECT_ROOT / "data"

    base_model_path = model_dir / "multi_mlp.onnx"
    params_path = model_dir / "multi_mlp_normalization_params.npz"
    cache_file = data_dir / "training_data.npz"

    if not base_model_path.exists():
        msg = f"Base model not found: {base_model_path}"
        raise FileNotFoundError(msg)

    base_session, input_params, output_params = load_base_model(
        base_model_path, params_path
    )

    # Load data
    # NOTE(review): cache_file existence is not checked before np.load,
    # unlike base_model_path above -- a missing cache raises a less
    # informative error. Consider adding the same guard.
    data = np.load(cache_file)
    X_train = data["X_train"]
    y_train = data["y_train"]
    X_val = data["X_val"]
    y_val = data["y_val"]

    # Normalize and generate base predictions
    X_train_norm = normalize_xyY(X_train, input_params)
    y_train_norm = normalize_munsell(y_train, output_params)
    base_pred_train_norm = base_session.run(None, {"xyY": X_train_norm})[0]

    X_val_norm = normalize_xyY(X_val, input_params)
    y_val_norm = normalize_munsell(y_val, output_params)
    base_pred_val_norm = base_session.run(None, {"xyY": X_val_norm})[0]

    # Compute errors
    # The error predictor learns the residual between the ground truth and
    # the frozen base model's prediction, both in normalized space.
    error_train = y_train_norm - base_pred_train_norm
    error_val = y_val_norm - base_pred_val_norm

    # Combined input
    # Features are the normalized xyY input concatenated with the base
    # model's normalized prediction (3 + 4 = 7 features).
    X_train_combined = np.concatenate([X_train_norm, base_pred_train_norm], axis=1)
    X_val_combined = np.concatenate([X_val_norm, base_pred_val_norm], axis=1)

    # PyTorch tensors
    X_train_t = torch.FloatTensor(X_train_combined)
    error_train_t = torch.FloatTensor(error_train)
    X_val_t = torch.FloatTensor(X_val_combined)
    error_val_t = torch.FloatTensor(error_val)

    # Data loaders
    train_dataset = TensorDataset(X_train_t, error_train_t)
    val_dataset = TensorDataset(X_val_t, error_val_t)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model
    # NOTE(review): ensure MultiMLPErrorPredictorToMunsell.__init__ accepts a
    # ``dropout`` keyword; the definition in networks.py only declares
    # ``chroma_width``, in which case this call raises TypeError.
    model = MultiMLPErrorPredictorToMunsell(chroma_width=chroma_width, dropout=dropout).to(
        device
    )

    total_params = sum(p.numel() for p in model.parameters())
    LOGGER.info(" Total parameters: %s", f"{total_params:,}")

    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)

    # MLflow setup
    run_name = setup_mlflow_experiment(
        "from_xyY", f"hparam_error_predictor_trial_{trial.number}"
    )

    # Training loop
    num_epochs = 100
    patience = 15
    best_val_loss = float("inf")
    patience_counter = 0

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(
            {
                "trial": trial.number,
                "lr": lr,
                "batch_size": batch_size,
                "chroma_width": chroma_width,
                "dropout": dropout,
                "mse_weight": mse_weight,
                "huber_weight": huber_weight,
                "huber_delta": huber_delta,
                "total_params": total_params,
            }
        )

        for epoch in range(num_epochs):
            train_loss = train_epoch(
                model, train_loader, optimizer, device, loss_params
            )
            val_loss = validate(model, val_loader, device, loss_params)

            mlflow.log_metrics(
                {
                    "train_loss": train_loss,
                    "val_loss": val_loss,
                },
                step=epoch,
            )

            # Periodic progress logging only, to keep output manageable.
            if (epoch + 1) % 10 == 0:
                LOGGER.info(
                    " Epoch %03d/%d - Train Loss: %.6f, Val Loss: %.6f",
                    epoch + 1,
                    num_epochs,
                    train_loss,
                    val_loss,
                )

            # Early stopping on stalled validation loss.
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    LOGGER.info(" Early stopping at epoch %d", epoch + 1)
                    break

            # Report to Optuna for median pruning.  Note this runs after the
            # early-stopping check, so the epoch that triggers early stop is
            # never reported.
            trial.report(val_loss, epoch)

            if trial.should_prune():
                LOGGER.info(" Trial pruned at epoch %d", epoch + 1)
                mlflow.log_metrics({"pruned": 1, "pruned_epoch": epoch})
                raise optuna.TrialPruned

        # Log final results
        mlflow.log_metrics(
            {
                "best_val_loss": best_val_loss,
                "final_train_loss": train_loss,
                "final_epoch": epoch + 1,
            }
        )

        LOGGER.info(" Final validation loss: %.6f", best_val_loss)

    return best_val_loss
421
+
422
+
423
def main() -> None:
    """
    Run hyperparameter search for Multi-MLP Error Predictor.

    Performs systematic hyperparameter optimization using Optuna with:
    - MedianPruner for early stopping of unpromising trials
    - 15 total trials
    - MLflow logging for each trial
    - Result visualization and saving

    The search aims to find optimal hyperparameters for predicting errors
    in a base Munsell prediction model, which can then be used to improve
    predictions by correcting systematic biases.
    """

    LOGGER.info("=" * 80)
    LOGGER.info("Multi-Error Predictor Hyperparameter Search with Optuna")
    LOGGER.info("=" * 80)

    study = optuna.create_study(
        direction="minimize",
        study_name="multi_mlp_error_predictor_hparam_search",
        pruner=optuna.pruners.MedianPruner(n_startup_trials=3, n_warmup_steps=10),
    )

    n_trials = 15

    LOGGER.info("")
    LOGGER.info("Starting hyperparameter search with %d trials...", n_trials)
    LOGGER.info("")

    study.optimize(objective, n_trials=n_trials, timeout=None)

    # Print results
    LOGGER.info("")
    LOGGER.info("=" * 80)
    LOGGER.info("Hyperparameter Search Results")
    LOGGER.info("=" * 80)
    LOGGER.info("")
    LOGGER.info("Best trial:")
    LOGGER.info(" Value (val_loss): %.6f", study.best_value)
    LOGGER.info("")
    LOGGER.info("Best hyperparameters:")
    for key, value in study.best_params.items():
        LOGGER.info(" %s: %s", key, value)

    # Save results
    results_dir = PROJECT_ROOT / "results" / "from_xyY"
    # parents=True so the run also works when the "results" root itself is
    # missing (fresh checkout).
    results_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = results_dir / f"error_predictor_hparam_search_{timestamp}.txt"

    with open(results_file, "w") as f:
        f.write("=" * 80 + "\n")
        f.write("Multi-Error Predictor Hyperparameter Search Results\n")
        f.write("=" * 80 + "\n\n")
        f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Number of trials: {len(study.trials)}\n")
        f.write(f"Best validation loss: {study.best_value:.6f}\n\n")
        f.write("Best hyperparameters:\n")
        for key, value in study.best_params.items():
            f.write(f" {key}: {value}\n")
        f.write("\n\nAll trials:\n")
        f.write("-" * 80 + "\n")

        for trial in study.trials:
            f.write(f"\nTrial {trial.number}:\n")
            # Pruned/failed trials carry no value.  The previous code embedded
            # a conditional inside the f-string format spec
            # ("{trial.value:.6f if trial.value else 'Pruned'}"), which is an
            # invalid format specification and raised ValueError for every
            # completed trial; it also misclassified a legitimate 0.0 loss as
            # pruned.  Format the value explicitly instead.
            value_text = (
                f"{trial.value:.6f}" if trial.value is not None else "Pruned"
            )
            f.write(f" Value: {value_text}\n")
            f.write(" Params:\n")
            for key, value in trial.params.items():
                f.write(f" {key}: {value}\n")

    LOGGER.info("")
    LOGGER.info("Results saved to: %s", results_file)
498
+
499
+
500
if __name__ == "__main__":
    # Script entry point: configure root logging for bare-message output.
    # ``force=True`` replaces any handlers already installed by imported
    # libraries so the script's log format wins.
    logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)

    main()
learning_munsell/training/from_xyY/hyperparameter_search_multi_head.py ADDED
@@ -0,0 +1,541 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hyperparameter search for Multi-Head model (xyY to Munsell) using Optuna.
3
+
4
+ Optimizes:
5
+ - Learning rate
6
+ - Batch size
7
+ - Encoder width multiplier (shared encoder capacity)
8
+ - Head width multiplier (component-specific head capacity)
9
+ - Chroma head width (specialized for chroma prediction)
10
+ - Dropout
11
+ - Weight decay
12
+
13
+ Objective: Minimize validation loss
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import logging
19
+ from datetime import datetime
20
+
21
+ import matplotlib.pyplot as plt
22
+ import mlflow
23
+ import numpy as np
24
+ import optuna
25
+ import torch
26
+ from optuna.trial import Trial
27
+ from torch import nn, optim
28
+ from torch.utils.data import DataLoader, TensorDataset
29
+
30
+ from learning_munsell import PROJECT_ROOT
31
+ from learning_munsell.utilities.common import setup_mlflow_experiment
32
+ from learning_munsell.utilities.data import MUNSELL_NORMALIZATION_PARAMS, normalize_munsell
33
+ from learning_munsell.utilities.losses import weighted_mse_loss
34
+ from learning_munsell.utilities.training import train_epoch, validate
35
+
36
+ LOGGER = logging.getLogger(__name__)
37
+
38
+
39
class MultiHeadParametric(nn.Module):
    """
    Parametric Multi-Head model for hyperparameter search (xyY to Munsell).

    This model uses a shared encoder to extract general color space features
    from xyY inputs, followed by component-specific heads for predicting
    each Munsell component independently.

    Architecture:
    - Shared encoder: 3 → h1 → h2 → h3 (scaled by encoder_width)
    - hue, value, code heads: h3 → h2' → h1' → 1 (scaled by head_width)
    - chroma head: h3 → h3'' → h2'' → h1'' → 1 (scaled by chroma_head_width)

    Parameters
    ----------
    encoder_width : float, optional
        Width multiplier for shared encoder layers. Default is 1.0.
        Base dimensions: h1=128, h2=256, h3=512.
    head_width : float, optional
        Width multiplier for hue, value, and code heads. Default is 1.0.
        Base dimensions: h1=128, h2=256.
    chroma_head_width : float, optional
        Width multiplier for chroma head (typically wider). Default is 1.0.
        Base dimensions: h1=128, h2=256, h3=384.
    dropout : float, optional
        Dropout rate applied after hidden layers. Default is 0.0.
        When 0.0, no Dropout modules are inserted at all.
    """

    def __init__(
        self,
        encoder_width: float = 1.0,
        head_width: float = 1.0,
        chroma_head_width: float = 1.0,
        dropout: float = 0.0,
    ) -> None:
        super().__init__()

        # Encoder dimensions (shared)
        e_h1 = int(128 * encoder_width)
        e_h2 = int(256 * encoder_width)
        e_h3 = int(512 * encoder_width)

        # Head dimensions (component-specific)
        h_h1 = int(128 * head_width)
        h_h2 = int(256 * head_width)

        # Chroma head dimensions (specialized: wider and one layer deeper)
        c_h1 = int(128 * chroma_head_width)
        c_h2 = int(256 * chroma_head_width)
        c_h3 = int(384 * chroma_head_width)

        # Shared encoder - learns general color space features.
        # Layer sequence is identical to building the stack by hand:
        # [Linear, ReLU, BatchNorm1d(, Dropout)] per hidden width.
        self.encoder = nn.Sequential(*self._hidden_stack(3, [e_h1, e_h2, e_h3], dropout))

        # Component-specific heads (hue, value, code): e_h3 → h_h2 → h_h1 → 1
        self.hue_head = self._make_head(e_h3, [h_h2, h_h1], dropout)
        self.value_head = self._make_head(e_h3, [h_h2, h_h1], dropout)
        self.code_head = self._make_head(e_h3, [h_h2, h_h1], dropout)

        # Chroma head - wider/deeper for the harder task:
        # e_h3 → c_h3 → c_h2 → c_h1 → 1
        self.chroma_head = self._make_head(e_h3, [c_h3, c_h2, c_h1], dropout)

    @staticmethod
    def _hidden_stack(in_dim: int, dims: list[int], dropout: float) -> list[nn.Module]:
        """
        Build a flat list of [Linear, ReLU, BatchNorm1d(, Dropout)] blocks.

        Parameters
        ----------
        in_dim : int
            Input feature dimension of the first Linear layer.
        dims : list[int]
            Output width of each successive hidden block.
        dropout : float
            Dropout rate; a Dropout layer is appended after each block
            only when strictly positive.

        Returns
        -------
        list[nn.Module]
            Layers in order, ready for ``nn.Sequential``.
        """
        layers: list[nn.Module] = []
        previous = in_dim
        for width in dims:
            layers.extend([nn.Linear(previous, width), nn.ReLU(), nn.BatchNorm1d(width)])
            if dropout > 0:
                layers.append(nn.Dropout(dropout))
            previous = width
        return layers

    @classmethod
    def _make_head(cls, in_dim: int, dims: list[int], dropout: float) -> nn.Sequential:
        """
        Build a prediction head: hidden stack over *dims*, then a scalar output.

        Parameters
        ----------
        in_dim : int
            Input feature dimension (the encoder's output width).
        dims : list[int]
            Hidden widths of the head, in order.
        dropout : float
            Dropout rate forwarded to the hidden stack.

        Returns
        -------
        nn.Sequential
            Head mapping (batch, in_dim) → (batch, 1).
        """
        layers = cls._hidden_stack(in_dim, dims, dropout)
        layers.append(nn.Linear(dims[-1], 1))
        return nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass through shared encoder and component-specific heads.

        Parameters
        ----------
        x : torch.Tensor
            Input tensor of shape (batch_size, 3) containing normalized
            xyY values.

        Returns
        -------
        torch.Tensor
            Predicted Munsell components, shape (batch_size, 4).
            Output order: [hue, value, chroma, code].
        """
        # Shared feature extraction
        features = self.encoder(x)

        # Component-specific predictions
        hue = self.hue_head(features)
        value = self.value_head(features)
        chroma = self.chroma_head(features)
        code = self.code_head(features)

        # Concatenate: [hue, value, chroma, code]
        return torch.cat([hue, value, chroma, code], dim=1)
217
+
218
+
219
def objective(trial: Trial) -> float:
    """
    Optuna objective function to minimize validation loss.

    This function defines the hyperparameter search space and training
    procedure for each trial. It optimizes:
    - Learning rate (1e-4 to 1e-3, log scale)
    - Batch size (256, 512, or 1024)
    - Encoder width multiplier (0.75 to 1.5)
    - Head width multiplier (0.75 to 1.5)
    - Chroma head width multiplier (1.0 to 1.75)
    - Dropout rate (0.0 to 0.2)
    - Weight decay (1e-5 to 1e-3, log scale)

    Each trial trains a ``MultiHeadParametric`` model for up to 100 epochs
    with early stopping (patience 15) and reports intermediate validation
    losses to Optuna for pruning. All metrics are mirrored to MLflow.

    Parameters
    ----------
    trial : Trial
        Optuna trial object for suggesting hyperparameters.

    Returns
    -------
    float
        Best validation loss achieved during training.

    Raises
    ------
    optuna.TrialPruned
        If trial is pruned based on intermediate results.
    """

    # Suggest hyperparameters
    lr = trial.suggest_float("lr", 1e-4, 1e-3, log=True)
    batch_size = trial.suggest_categorical("batch_size", [256, 512, 1024])
    encoder_width = trial.suggest_float("encoder_width", 0.75, 1.5, step=0.25)
    head_width = trial.suggest_float("head_width", 0.75, 1.5, step=0.25)
    chroma_head_width = trial.suggest_float("chroma_head_width", 1.0, 1.75, step=0.25)
    dropout = trial.suggest_float("dropout", 0.0, 0.2, step=0.05)
    weight_decay = trial.suggest_float("weight_decay", 1e-5, 1e-3, log=True)

    LOGGER.info("")
    LOGGER.info("=" * 80)
    LOGGER.info("Trial %d", trial.number)
    LOGGER.info("=" * 80)
    LOGGER.info(" lr: %.6f", lr)
    LOGGER.info(" batch_size: %d", batch_size)
    LOGGER.info(" encoder_width: %.2f", encoder_width)
    LOGGER.info(" head_width: %.2f", head_width)
    LOGGER.info(" chroma_head_width: %.2f", chroma_head_width)
    LOGGER.info(" dropout: %.2f", dropout)
    LOGGER.info(" weight_decay: %.6f", weight_decay)

    # Set device: MPS (Apple Silicon) when available, otherwise CPU.
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    LOGGER.info(" device: %s", device)

    # Load data (re-read from the cached NPZ on every trial)
    data_dir = PROJECT_ROOT / "data"
    cache_file = data_dir / "training_data.npz"
    data = np.load(cache_file)

    X_train = data["X_train"]
    y_train = data["y_train"]
    X_val = data["X_val"]
    y_val = data["y_val"]

    # Normalize outputs (xyY inputs are already in [0, 1] range)
    output_params = MUNSELL_NORMALIZATION_PARAMS
    y_train_norm = normalize_munsell(y_train, output_params)
    y_val_norm = normalize_munsell(y_val, output_params)

    # Convert to tensors
    X_train_t = torch.from_numpy(X_train).float()
    y_train_t = torch.from_numpy(y_train_norm).float()
    X_val_t = torch.from_numpy(X_val).float()
    y_val_t = torch.from_numpy(y_val_norm).float()

    train_loader = DataLoader(
        TensorDataset(X_train_t, y_train_t), batch_size=batch_size, shuffle=True
    )
    val_loader = DataLoader(
        TensorDataset(X_val_t, y_val_t), batch_size=batch_size, shuffle=False
    )

    LOGGER.info(
        " Training samples: %d, Validation samples: %d", len(X_train_t), len(X_val_t)
    )

    # Initialize model
    model = MultiHeadParametric(
        encoder_width=encoder_width,
        head_width=head_width,
        chroma_head_width=chroma_head_width,
        dropout=dropout,
    ).to(device)

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    LOGGER.info(" Total parameters: %s", f"{total_params:,}")

    # Training setup; T_max matches num_epochs below so the cosine schedule
    # spans exactly one full search budget.
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)

    # MLflow setup
    run_name = setup_mlflow_experiment(
        "from_xyY", f"hparam_multi_head_trial_{trial.number}"
    )

    # Training loop with early stopping
    num_epochs = 100  # Reduced for hyperparameter search
    patience = 15
    best_val_loss = float("inf")
    patience_counter = 0

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(
            {
                "trial": trial.number,
                "lr": lr,
                "batch_size": batch_size,
                "encoder_width": encoder_width,
                "head_width": head_width,
                "chroma_head_width": chroma_head_width,
                "dropout": dropout,
                "weight_decay": weight_decay,
                "total_params": total_params,
            }
        )

        for epoch in range(num_epochs):
            train_loss = train_epoch(
                model, train_loader, optimizer, weighted_mse_loss, device
            )
            val_loss = validate(model, val_loader, weighted_mse_loss, device)
            scheduler.step()

            # Per-component MAE on the whole validation set in one pass.
            # NOTE(review): the model's train/eval mode here depends on what
            # validate() leaves behind — confirm BatchNorm/Dropout are in
            # eval mode for this pass, otherwise the MAE metrics are noisy.
            with torch.no_grad():
                pred_val = model(X_val_t.to(device))
                mae = torch.mean(torch.abs(pred_val - y_val_t.to(device)), dim=0).cpu()

            # Log to MLflow
            mlflow.log_metrics(
                {
                    "train_loss": train_loss,
                    "val_loss": val_loss,
                    "mae_hue": mae[0].item(),
                    "mae_value": mae[1].item(),
                    "mae_chroma": mae[2].item(),
                    "mae_code": mae[3].item(),
                    "learning_rate": optimizer.param_groups[0]["lr"],
                },
                step=epoch,
            )

            if (epoch + 1) % 10 == 0:
                LOGGER.info(
                    " Epoch %03d/%d - Train: %.6f, Val: %.6f - "
                    "MAE: hue=%.6f, value=%.6f, chroma=%.6f, code=%.6f",
                    epoch + 1,
                    num_epochs,
                    train_loss,
                    val_loss,
                    mae[0],
                    mae[1],
                    mae[2],
                    mae[3],
                )

            # Early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    LOGGER.info(" Early stopping at epoch %d", epoch + 1)
                    break

            # Report intermediate value for pruning.
            # NOTE(review): when early stopping breaks above, the final
            # epoch's value is never reported to Optuna — presumably
            # acceptable since the trial ends with best_val_loss anyway.
            trial.report(val_loss, epoch)

            # Handle pruning
            if trial.should_prune():
                LOGGER.info(" Trial pruned at epoch %d", epoch + 1)
                mlflow.log_metrics({"pruned": 1, "pruned_epoch": epoch})
                raise optuna.TrialPruned

        # Log final results (mae/epoch refer to the last executed epoch)
        mlflow.log_metrics(
            {
                "best_val_loss": best_val_loss,
                "final_train_loss": train_loss,
                "final_mae_hue": mae[0].item(),
                "final_mae_value": mae[1].item(),
                "final_mae_chroma": mae[2].item(),
                "final_mae_code": mae[3].item(),
                "final_epoch": epoch + 1,
            }
        )

        LOGGER.info(" Final validation loss: %.6f", best_val_loss)

    return best_val_loss
423
+
424
+
425
def main() -> None:
    """
    Run hyperparameter search for Multi-Head model (xyY to Munsell).

    Performs systematic hyperparameter optimization using Optuna with:
    - MedianPruner for early stopping of unpromising trials
    - 20 total trials
    - MLflow logging for each trial
    - Result visualization using matplotlib (optimization history,
      parameter importances, parallel coordinate plot)

    Results are written to ``results/from_xyY`` as a timestamped text
    report plus three PNG plots.
    """

    LOGGER.info("=" * 80)
    LOGGER.info("Multi-Head (from_xyY) Hyperparameter Search with Optuna")
    LOGGER.info("=" * 80)

    # Create study; MedianPruner needs 3 complete trials and 10 warm-up
    # epochs before it starts pruning.
    study = optuna.create_study(
        direction="minimize",
        study_name="multi_head_from_xyY_hparam_search",
        pruner=optuna.pruners.MedianPruner(n_startup_trials=3, n_warmup_steps=10),
    )

    # Run optimization
    n_trials = 20  # Number of trials to run

    LOGGER.info("")
    LOGGER.info("Starting hyperparameter search with %d trials...", n_trials)
    LOGGER.info("")

    study.optimize(objective, n_trials=n_trials, timeout=None)

    # Print results
    LOGGER.info("")
    LOGGER.info("=" * 80)
    LOGGER.info("Hyperparameter Search Results")
    LOGGER.info("=" * 80)
    LOGGER.info("")
    LOGGER.info("Best trial:")
    LOGGER.info(" Value (val_loss): %.6f", study.best_value)
    LOGGER.info("")
    LOGGER.info("Best hyperparameters:")
    for key, value in study.best_params.items():
        LOGGER.info(" %s: %s", key, value)

    # Save results
    results_dir = PROJECT_ROOT / "results" / "from_xyY"
    results_dir.mkdir(exist_ok=True, parents=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = results_dir / f"hparam_search_multi_head_{timestamp}.txt"

    with open(results_file, "w") as f:
        f.write("=" * 80 + "\n")
        f.write("Multi-Head (from_xyY) Hyperparameter Search Results\n")
        f.write("=" * 80 + "\n\n")
        f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Number of trials: {len(study.trials)}\n")
        f.write(f"Best validation loss: {study.best_value:.6f}\n\n")
        f.write("Best hyperparameters:\n")
        for key, value in study.best_params.items():
            f.write(f" {key}: {value}\n")
        f.write("\n\nAll trials:\n")
        f.write("-" * 80 + "\n")

        # Pruned trials have value=None and are reported as "Pruned".
        for t in study.trials:
            f.write(f"\nTrial {t.number}:\n")
            if t.value is not None:
                f.write(f" Value: {t.value:.6f}\n")
            else:
                f.write(" Value: Pruned\n")
            f.write(" Params:\n")
            for key, value in t.params.items():
                f.write(f" {key}: {value}\n")

    LOGGER.info("")
    LOGGER.info("Results saved to: %s", results_file)

    # Generate visualizations using matplotlib.
    # Imported here (not at module top) so the search itself does not
    # depend on optuna's optional visualization extras.
    from optuna.visualization.matplotlib import (
        plot_optimization_history,
        plot_param_importances,
        plot_parallel_coordinate,
    )

    # Optimization history
    ax = plot_optimization_history(study)
    ax.figure.savefig(
        results_dir / f"optimization_history_multi_head_{timestamp}.png", dpi=150
    )
    plt.close(ax.figure)

    # Parameter importances
    # NOTE(review): presumably requires at least two completed trials to
    # compute importances — verify against the Optuna version in use.
    ax = plot_param_importances(study)
    ax.figure.savefig(
        results_dir / f"param_importances_multi_head_{timestamp}.png", dpi=150
    )
    plt.close(ax.figure)

    # Parallel coordinate plot
    ax = plot_parallel_coordinate(study)
    ax.figure.savefig(
        results_dir / f"parallel_coordinate_multi_head_{timestamp}.png", dpi=150
    )
    plt.close(ax.figure)

    LOGGER.info("Visualizations saved to: %s", results_dir)
536
+
537
+
538
if __name__ == "__main__":
    # Message-only format keeps console output aligned with the banner-style
    # LOGGER calls; force=True replaces any handlers already installed by
    # imported libraries.
    logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)

    main()
learning_munsell/training/from_xyY/hyperparameter_search_multi_head_error_predictor.py ADDED
@@ -0,0 +1,552 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hyperparameter search for Multi-Head Error Predictor using Optuna.
3
+
4
+ Optimizes:
5
+ - Learning rate
6
+ - Batch size
7
+ - Width multipliers for each component branch (hue, value, chroma, code)
8
+ - Loss function component weights
9
+
10
+ Objective: Minimize validation loss (combined base + error predictor)
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+ from datetime import datetime
17
+ from pathlib import Path
18
+
19
+ import matplotlib.pyplot as plt
20
+ import mlflow
21
+ import numpy as np
22
+ import onnxruntime as ort
23
+ import optuna
24
+ import torch
25
+ from numpy.typing import NDArray
26
+ from optuna.trial import Trial
27
+ from torch import nn, optim
28
+ from torch.utils.data import DataLoader, TensorDataset
29
+
30
+ from learning_munsell import PROJECT_ROOT
31
+ from learning_munsell.models.networks import ComponentErrorPredictor
32
+ from learning_munsell.utilities.common import setup_mlflow_experiment
33
+ from learning_munsell.utilities.data import normalize_xyY, normalize_munsell
34
+ from learning_munsell.utilities.training import train_epoch, validate
35
+
36
+ LOGGER = logging.getLogger(__name__)
37
+
38
+
39
class MultiHeadErrorPredictorParametric(nn.Module):
    """
    Parametric Multi-Head error predictor with 4 independent branches.

    Wraps four separate ComponentErrorPredictor networks — one per
    Munsell component (hue, value, chroma, code) — so that each branch's
    capacity can be tuned independently during hyperparameter search.

    Parameters
    ----------
    hue_width : float, optional
        Width multiplier for the hue branch. Default is 1.0.
    value_width : float, optional
        Width multiplier for the value branch. Default is 1.0.
    chroma_width : float, optional
        Width multiplier for the chroma branch. Default is 1.5.
    code_width : float, optional
        Width multiplier for the code branch. Default is 1.0.
    """

    def __init__(
        self,
        hue_width: float = 1.0,
        value_width: float = 1.0,
        chroma_width: float = 1.5,
        code_width: float = 1.0,
    ) -> None:
        super().__init__()

        # One independent error-predictor network per Munsell component.
        self.hue_branch = ComponentErrorPredictor(width_multiplier=hue_width)
        self.value_branch = ComponentErrorPredictor(width_multiplier=value_width)
        self.chroma_branch = ComponentErrorPredictor(width_multiplier=chroma_width)
        self.code_branch = ComponentErrorPredictor(width_multiplier=code_width)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass through all four error predictor branches.

        Parameters
        ----------
        x : torch.Tensor
            Input tensor of shape (batch_size, 7) containing normalized
            xyY values and base model predictions.

        Returns
        -------
        torch.Tensor
            Predicted errors for all components, shape (batch_size, 4).
            Output order: [hue_error, value_error, chroma_error, code_error].
        """
        # Every branch receives the same combined input and predicts its
        # own component's error independently of the others.
        branches = (
            self.hue_branch,
            self.value_branch,
            self.chroma_branch,
            self.code_branch,
        )
        per_component = [branch(x) for branch in branches]

        # Concatenate in fixed order: [hue, value, chroma, code] errors.
        return torch.cat(per_component, dim=1)
104
+
105
+
106
def load_base_model(
    model_path: Path, params_path: Path
) -> tuple[ort.InferenceSession, dict, dict]:
    """
    Load the base Multi-Head ONNX model and its normalization parameters.

    Parameters
    ----------
    model_path : Path
        Path to the base Multi-Head model ONNX file.
    params_path : Path
        Path to the normalization parameters NPZ file.

    Returns
    -------
    ort.InferenceSession
        ONNX Runtime inference session for the base model.
    dict
        Input normalization parameters (x_range, y_range, Y_range).
    dict
        Output normalization parameters (hue_range, value_range, chroma_range, code_range).
    """
    inference_session = ort.InferenceSession(str(model_path))

    # The NPZ stores the parameter dicts as 0-d object arrays, hence
    # allow_pickle + .item() to recover plain dicts.
    archive = np.load(params_path, allow_pickle=True)
    input_params = archive["input_params"].item()
    output_params = archive["output_params"].item()

    return inference_session, input_params, output_params
131
+
132
+
133
def create_weighted_loss(
    mse_weight: float,
    mae_weight: float,
    log_weight: float,
    huber_weight: float,
    huber_delta: float,
):
    """
    Create a weighted loss function combining multiple loss components.

    The returned closure blends four penalties on the prediction residual:
    squared error, absolute error, a logarithmic penalty on the scaled
    absolute error, and a Huber term.

    Parameters
    ----------
    mse_weight : float
        Weight for MSE component.
    mae_weight : float
        Weight for MAE component.
    log_weight : float
        Weight for logarithmic penalty component.
    huber_weight : float
        Weight for Huber loss component.
    huber_delta : float
        Delta parameter for Huber loss transition point.

    Returns
    -------
    callable
        Loss function that accepts (pred, target) and returns a scalar loss.
    """

    def weighted_loss(pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        """
        Compute weighted combination of loss components.

        Parameters
        ----------
        pred : torch.Tensor
            Predicted values, shape (batch_size, n_components).
        target : torch.Tensor
            Target values, shape (batch_size, n_components).

        Returns
        -------
        torch.Tensor
            Weighted combination of loss components, scalar tensor.
        """
        residual = pred - target
        abs_residual = residual.abs()

        # Quadratic penalty (standard MSE)
        mse_term = (residual**2).mean()

        # Absolute penalty (MAE)
        mae_term = abs_residual.mean()

        # Logarithmic penalty; the x1000 factor amplifies small
        # normalized residuals before log1p.
        log_term = torch.log1p(abs_residual * 1000.0).mean()

        # Huber: quadratic below huber_delta, linear above it.
        quadratic = 0.5 * abs_residual**2
        linear = huber_delta * (abs_residual - 0.5 * huber_delta)
        huber_term = torch.where(abs_residual <= huber_delta, quadratic, linear).mean()

        # Weighted blend of all four terms.
        return (
            mse_weight * mse_term
            + mae_weight * mae_term
            + log_weight * log_term
            + huber_weight * huber_term
        )

    return weighted_loss
+
206
+
207
def objective(trial: Trial) -> float:
    """
    Optuna objective function to minimize validation loss.

    This function defines the hyperparameter search space and training
    procedure for each trial. It optimizes:
    - Learning rate (1e-4 to 1e-3, log scale)
    - Batch size (512, 1024, or 2048)
    - Width multipliers for each component branch
    - Loss function weights (MSE, MAE, log penalty, Huber)
    - Huber delta parameter (0.005 to 0.02)

    The error predictor is trained on the residuals of a frozen base
    Multi-Head ONNX model: targets are (true - base prediction) in
    normalized space, and inputs are [normalized xyY, base prediction].

    Parameters
    ----------
    trial : Trial
        Optuna trial object for suggesting hyperparameters.

    Returns
    -------
    float
        Best validation loss achieved during training.

    Raises
    ------
    optuna.TrialPruned
        If trial is pruned based on intermediate results.
    """

    # Suggest hyperparameters
    lr = trial.suggest_float("lr", 1e-4, 1e-3, log=True)
    batch_size = trial.suggest_categorical("batch_size", [512, 1024, 2048])
    hue_width = trial.suggest_float("hue_width", 0.75, 1.5, step=0.25)
    value_width = trial.suggest_float("value_width", 0.75, 1.5, step=0.25)
    chroma_width = trial.suggest_float("chroma_width", 1.0, 2.0, step=0.25)
    code_width = trial.suggest_float("code_width", 0.75, 1.5, step=0.25)

    # Loss function weights
    mse_weight = trial.suggest_float("mse_weight", 0.5, 2.0, step=0.5)
    mae_weight = trial.suggest_float("mae_weight", 0.0, 1.0, step=0.25)
    log_weight = trial.suggest_float("log_weight", 0.0, 0.5, step=0.1)
    huber_weight = trial.suggest_float("huber_weight", 0.0, 1.0, step=0.25)
    huber_delta = trial.suggest_float("huber_delta", 0.005, 0.02, step=0.005)

    LOGGER.info("")
    LOGGER.info("=" * 80)
    LOGGER.info("Trial %d", trial.number)
    LOGGER.info("=" * 80)
    LOGGER.info(" lr: %.6f", lr)
    LOGGER.info(" batch_size: %d", batch_size)
    LOGGER.info(" hue_width: %.2f", hue_width)
    LOGGER.info(" value_width: %.2f", value_width)
    LOGGER.info(" chroma_width: %.2f", chroma_width)
    LOGGER.info(" code_width: %.2f", code_width)
    LOGGER.info(" mse_weight: %.2f", mse_weight)
    LOGGER.info(" mae_weight: %.2f", mae_weight)
    LOGGER.info(" log_weight: %.2f", log_weight)
    LOGGER.info(" huber_weight: %.2f", huber_weight)
    LOGGER.info(" huber_delta: %.3f", huber_delta)

    # Set device: MPS (Apple Silicon) when available, otherwise CPU.
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    LOGGER.info(" device: %s", device)

    # Paths
    model_directory = PROJECT_ROOT / "models" / "from_xyY"
    data_dir = PROJECT_ROOT / "data"

    base_model_path = model_directory / "multi_head.onnx"
    params_path = model_directory / "multi_head_normalization_params.npz"
    cache_file = data_dir / "training_data.npz"

    # Load base model (frozen; only used for inference below)
    base_session, input_params, output_params = load_base_model(
        base_model_path, params_path
    )

    # Load training data
    data = np.load(cache_file)
    X_train = data["X_train"]
    y_train = data["y_train"]
    X_val = data["X_val"]
    y_val = data["y_val"]

    # Normalize
    X_train_norm = normalize_xyY(X_train, input_params)
    y_train_norm = normalize_munsell(y_train, output_params)
    X_val_norm = normalize_xyY(X_val, input_params)
    y_val_norm = normalize_munsell(y_val, output_params)

    # Generate base model predictions.
    # NOTE(review): assumes the ONNX graph's input is named "xyY" and that
    # normalize_xyY returns float32 arrays matching the graph's dtype —
    # confirm against the export in the training script.
    base_pred_train_norm = base_session.run(None, {"xyY": X_train_norm})[0]
    base_pred_val_norm = base_session.run(None, {"xyY": X_val_norm})[0]

    # Compute errors (regression targets for the error predictor)
    error_train = y_train_norm - base_pred_train_norm
    error_val = y_val_norm - base_pred_val_norm

    # Create combined input: [xyY_norm, base_prediction_norm]
    X_train_combined = np.concatenate([X_train_norm, base_pred_train_norm], axis=1)
    X_val_combined = np.concatenate([X_val_norm, base_pred_val_norm], axis=1)

    # Convert to PyTorch tensors
    X_train_t = torch.FloatTensor(X_train_combined)
    error_train_t = torch.FloatTensor(error_train)
    X_val_t = torch.FloatTensor(X_val_combined)
    error_val_t = torch.FloatTensor(error_val)

    # Create data loaders
    train_loader = DataLoader(
        TensorDataset(X_train_t, error_train_t), batch_size=batch_size, shuffle=True
    )
    val_loader = DataLoader(
        TensorDataset(X_val_t, error_val_t), batch_size=batch_size, shuffle=False
    )

    LOGGER.info(
        " Training samples: %d, Validation samples: %d", len(X_train_t), len(X_val_t)
    )

    # Initialize error predictor model
    model = MultiHeadErrorPredictorParametric(
        hue_width=hue_width,
        value_width=value_width,
        chroma_width=chroma_width,
        code_width=code_width,
    ).to(device)

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    LOGGER.info(" Total parameters: %s", f"{total_params:,}")

    # Training setup (fixed weight decay here; only lr is searched)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.5, patience=5
    )

    # Create loss function
    criterion = create_weighted_loss(
        mse_weight, mae_weight, log_weight, huber_weight, huber_delta
    )

    # MLflow setup
    run_name = setup_mlflow_experiment(
        "from_xyY", f"hparam_multi_head_error_trial_{trial.number}"
    )

    # Training loop with early stopping
    num_epochs = 50  # Reduced for hyperparameter search
    patience = 10
    best_val_loss = float("inf")
    patience_counter = 0

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(
            {
                "lr": lr,
                "batch_size": batch_size,
                "hue_width": hue_width,
                "value_width": value_width,
                "chroma_width": chroma_width,
                "code_width": code_width,
                "mse_weight": mse_weight,
                "mae_weight": mae_weight,
                "log_weight": log_weight,
                "huber_weight": huber_weight,
                "huber_delta": huber_delta,
                "total_params": total_params,
                "trial_number": trial.number,
            }
        )

        for epoch in range(num_epochs):
            train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
            val_loss = validate(model, val_loader, criterion, device)
            # ReduceLROnPlateau steps on the monitored metric, not per-epoch.
            scheduler.step(val_loss)

            # Log to MLflow
            mlflow.log_metrics(
                {
                    "train_loss": train_loss,
                    "val_loss": val_loss,
                    "learning_rate": optimizer.param_groups[0]["lr"],
                },
                step=epoch,
            )

            if (epoch + 1) % 10 == 0:
                LOGGER.info(
                    " Epoch %03d/%d - Train: %.6f, Val: %.6f, LR: %.6f",
                    epoch + 1,
                    num_epochs,
                    train_loss,
                    val_loss,
                    optimizer.param_groups[0]["lr"],
                )

            # Early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    LOGGER.info(" Early stopping at epoch %d", epoch + 1)
                    break

            # Report intermediate value for pruning.
            # NOTE(review): when early stopping breaks above, the final
            # epoch's value is never reported to Optuna — presumably fine
            # since the trial returns best_val_loss regardless.
            trial.report(val_loss, epoch)

            # Handle pruning
            if trial.should_prune():
                LOGGER.info(" Trial pruned at epoch %d", epoch + 1)
                mlflow.log_metrics({"pruned": 1, "pruned_epoch": epoch})
                raise optuna.TrialPruned

        # Log final results
        mlflow.log_metrics(
            {
                "best_val_loss": best_val_loss,
                "final_train_loss": train_loss,
            }
        )

        LOGGER.info(" Final validation loss: %.6f", best_val_loss)

    return best_val_loss
+ return best_val_loss
434
+
435
+
436
def main() -> None:
    """
    Run hyperparameter search for Multi-Head Error Predictor.

    Performs systematic hyperparameter optimization using Optuna with:
    - MedianPruner for early stopping of unpromising trials
    - 30 total trials
    - MLflow logging for each trial
    - Result visualization using matplotlib (optimization history,
      parameter importances, parallel coordinate plot)

    The search aims to find optimal hyperparameters for predicting errors
    in a base Multi-Head model, allowing for error correction and improved
    Munsell predictions.
    """

    LOGGER.info("=" * 80)
    LOGGER.info("Multi-Head Error Predictor Hyperparameter Search with Optuna")
    LOGGER.info("=" * 80)

    # Create study.  MedianPruner kills trials whose intermediate val_loss is
    # worse than the running median, after 3 un-pruned startup trials and a
    # 5-epoch warm-up per trial.
    study = optuna.create_study(
        direction="minimize",
        study_name="multi_head_error_predictor_hparam_search",
        pruner=optuna.pruners.MedianPruner(n_startup_trials=3, n_warmup_steps=5),
    )

    # Run optimization
    n_trials = 30  # Number of trials to run

    LOGGER.info("")
    LOGGER.info("Starting hyperparameter search with %d trials...", n_trials)
    LOGGER.info("")

    # `objective` is the module-level trial function; no wall-clock limit.
    study.optimize(objective, n_trials=n_trials, timeout=None)

    # Print results
    LOGGER.info("")
    LOGGER.info("=" * 80)
    LOGGER.info("Hyperparameter Search Results")
    LOGGER.info("=" * 80)
    LOGGER.info("")
    LOGGER.info("Best trial:")
    LOGGER.info(" Value (val_loss): %.6f", study.best_value)
    LOGGER.info("")
    LOGGER.info("Best hyperparameters:")
    for key, value in study.best_params.items():
        LOGGER.info(" %s: %s", key, value)

    # Save a plain-text summary of the search alongside the plots.
    results_dir = PROJECT_ROOT / "results" / "from_xyY"
    results_dir.mkdir(exist_ok=True, parents=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = results_dir / f"hparam_search_multi_head_error_{timestamp}.txt"

    with open(results_file, "w") as f:
        f.write("=" * 80 + "\n")
        f.write("Multi-Head Error Predictor Hyperparameter Search Results\n")
        f.write("=" * 80 + "\n\n")
        f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Number of trials: {len(study.trials)}\n")
        f.write(f"Best validation loss: {study.best_value:.6f}\n\n")
        f.write("Best hyperparameters:\n")
        for key, value in study.best_params.items():
            f.write(f" {key}: {value}\n")
        f.write("\n\nAll trials:\n")
        f.write("-" * 80 + "\n")

        for t in study.trials:
            f.write(f"\nTrial {t.number}:\n")
            # Pruned/failed trials have `value is None`.
            if t.value is not None:
                f.write(f" Value: {t.value:.6f}\n")
            else:
                f.write(" Value: Pruned\n")
            f.write(" Params:\n")
            for key, value in t.params.items():
                f.write(f" {key}: {value}\n")

    LOGGER.info("")
    LOGGER.info("Results saved to: %s", results_file)

    # Generate visualizations using matplotlib.  Imported here rather than at
    # module top — presumably to keep the plotting extras out of the hot path;
    # matches the sibling hparam-search scripts.
    from optuna.visualization.matplotlib import (
        plot_optimization_history,
        plot_param_importances,
        plot_parallel_coordinate,
    )

    # Optimization history
    ax = plot_optimization_history(study)
    ax.figure.savefig(
        results_dir / f"optimization_history_multi_head_error_{timestamp}.png", dpi=150
    )
    plt.close(ax.figure)

    # Parameter importances
    ax = plot_param_importances(study)
    ax.figure.savefig(
        results_dir / f"param_importances_multi_head_error_{timestamp}.png", dpi=150
    )
    plt.close(ax.figure)

    # Parallel coordinate plot
    ax = plot_parallel_coordinate(study)
    ax.figure.savefig(
        results_dir / f"parallel_coordinate_multi_head_error_{timestamp}.png", dpi=150
    )
    plt.close(ax.figure)

    LOGGER.info("Visualizations saved to: %s", results_dir)
547
+
548
+
549
if __name__ == "__main__":
    # force=True replaces any handlers already installed by imported
    # libraries so the bare-message format takes effect.
    logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)

    main()
learning_munsell/training/from_xyY/hyperparameter_search_multi_mlp.py ADDED
@@ -0,0 +1,471 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hyperparameter search for Multi-MLP model using Optuna.
3
+
4
+ Optimizes:
5
+ - Learning rate
6
+ - Batch size
7
+ - Chroma width multiplier
8
+ - Chroma loss weight
9
+ - Code loss weight
10
+ - Dropout (optional)
11
+
12
+ Objective: Minimize validation loss
13
+ """
14
+
15
+ import logging
16
+ from datetime import datetime
17
+
18
+ import matplotlib.pyplot as plt
19
+ import mlflow
20
+ import numpy as np
21
+ import optuna
22
+ import torch
23
+ from numpy.typing import NDArray
24
+ from optuna.trial import Trial
25
+ from torch import nn, optim
26
+ from torch.utils.data import DataLoader, TensorDataset
27
+
28
+ from learning_munsell import PROJECT_ROOT
29
+ from learning_munsell.models.networks import MultiMLPToMunsell
30
+ from learning_munsell.utilities.common import setup_mlflow_experiment
31
+ from learning_munsell.utilities.data import MUNSELL_NORMALIZATION_PARAMS, normalize_munsell
32
+
33
+ LOGGER = logging.getLogger(__name__)
34
+
35
+
36
def weighted_mse_loss(
    pred: torch.Tensor,
    target: torch.Tensor,
    hue_weight: float = 1.0,
    value_weight: float = 1.0,
    chroma_weight: float = 4.0,
    code_weight: float = 0.5,
) -> torch.Tensor:
    """
    Compute an MSE loss with per-component weights.

    Each of the four Munsell components (hue, value, chroma, code) receives
    its own weight so that harder or more important components contribute
    more to the gradient.

    Parameters
    ----------
    pred : torch.Tensor
        Predicted values, shape (batch_size, 4).
    target : torch.Tensor
        Target values, shape (batch_size, 4).
    hue_weight : float, optional
        Weight for hue component. Default is 1.0.
    value_weight : float, optional
        Weight for value component. Default is 1.0.
    chroma_weight : float, optional
        Weight for chroma component (typically higher). Default is 4.0.
    code_weight : float, optional
        Weight for code component (typically lower). Default is 0.5.

    Returns
    -------
    torch.Tensor
        Scalar weighted MSE loss (mean over all batch elements and
        components; note the mean is not re-normalised by the weight sum).
    """
    component_weights = torch.tensor(
        [hue_weight, value_weight, chroma_weight, code_weight],
        device=pred.device,
    )

    squared_error = torch.square(pred - target)
    return (squared_error * component_weights).mean()
77
+
78
+
79
def train_epoch(
    model: nn.Module,
    dataloader: DataLoader,
    optimizer: optim.Optimizer,
    device: torch.device,
    chroma_weight: float,
    code_weight: float,
) -> float:
    """
    Run one full optimization pass over the training set.

    Parameters
    ----------
    model : nn.Module
        Multi-MLP model to train.
    dataloader : DataLoader
        DataLoader providing training batches.
    optimizer : optim.Optimizer
        Optimizer for updating model parameters.
    device : torch.device
        Device to run training on (CPU, CUDA, or MPS).
    chroma_weight : float
        Weight for chroma component in loss function.
    code_weight : float
        Weight for code component in loss function.

    Returns
    -------
    float
        Mean batch loss across the epoch.
    """
    model.train()
    running_loss = 0.0

    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass through all component branches.
        predictions = model(inputs)
        batch_loss = weighted_mse_loss(
            predictions,
            targets,
            chroma_weight=chroma_weight,
            code_weight=code_weight,
        )

        # Standard zero-grad / backward / step update.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(dataloader)
130
+
131
+
132
def validate(
    model: nn.Module,
    dataloader: DataLoader,
    device: torch.device,
    chroma_weight: float,
    code_weight: float,
) -> float:
    """
    Evaluate the model on the validation set without gradient tracking.

    Parameters
    ----------
    model : nn.Module
        Multi-MLP model to validate.
    dataloader : DataLoader
        DataLoader providing validation batches.
    device : torch.device
        Device to run validation on (CPU, CUDA, or MPS).
    chroma_weight : float
        Weight for chroma component in loss function.
    code_weight : float
        Weight for code component in loss function.

    Returns
    -------
    float
        Mean batch loss over the validation set.
    """
    model.eval()
    cumulative_loss = 0.0

    # Inference only: disable autograd for speed and memory.
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            predictions = model(inputs)
            cumulative_loss += weighted_mse_loss(
                predictions,
                targets,
                chroma_weight=chroma_weight,
                code_weight=code_weight,
            ).item()

    return cumulative_loss / len(dataloader)
175
+
176
+
177
def objective(trial: Trial) -> float:
    """
    Optuna objective function to minimize validation loss.

    This function defines the hyperparameter search space and training
    procedure for each trial. It optimizes:
    - Learning rate (1e-4 to 1e-3, log scale)
    - Batch size (512, 1024, or 2048)
    - Chroma branch width multiplier (1.5 to 2.5)
    - Chroma loss weight (3.0 to 6.0)
    - Code loss weight (0.3 to 1.0)
    - Dropout rate (0.0 to 0.2)

    Parameters
    ----------
    trial : Trial
        Optuna trial object for suggesting hyperparameters.

    Returns
    -------
    float
        Best validation loss achieved during training.

    Raises
    ------
    FileNotFoundError
        If training data file is not found.
    optuna.TrialPruned
        If trial is pruned based on intermediate results.
    """

    # Suggest hyperparameters
    lr = trial.suggest_float("lr", 1e-4, 1e-3, log=True)
    batch_size = trial.suggest_categorical("batch_size", [512, 1024, 2048])
    chroma_width = trial.suggest_float("chroma_width", 1.5, 2.5, step=0.25)
    chroma_weight = trial.suggest_float("chroma_weight", 3.0, 6.0, step=0.5)
    code_weight = trial.suggest_float("code_weight", 0.3, 1.0, step=0.1)
    dropout = trial.suggest_float("dropout", 0.0, 0.2, step=0.05)

    LOGGER.info("")
    LOGGER.info("=" * 80)
    LOGGER.info("Trial %d", trial.number)
    LOGGER.info("=" * 80)
    LOGGER.info(" lr: %.6f", lr)
    LOGGER.info(" batch_size: %d", batch_size)
    LOGGER.info(" chroma_width: %.2f", chroma_width)
    LOGGER.info(" chroma_weight: %.1f", chroma_weight)
    LOGGER.info(" code_weight: %.1f", code_weight)
    LOGGER.info(" dropout: %.2f", dropout)

    # Set device.  NOTE(review): unlike other scripts in this package this
    # one does not check for MPS — confirm whether that is intentional.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load training data.  The .npz is re-read from disk on every trial;
    # it is assumed to contain pre-split X_train/y_train/X_val/y_val arrays
    # produced by generate_training_data.py.
    data_file = PROJECT_ROOT / "data" / "training_data.npz"

    if not data_file.exists():
        LOGGER.error("Training data not found at %s", data_file)
        LOGGER.error("Run generate_training_data.py first")
        msg = f"Training data not found: {data_file}"
        raise FileNotFoundError(msg)

    data = np.load(data_file)

    # Use pre-split data
    X_train = data["X_train"]
    y_train = data["y_train"]
    X_val = data["X_val"]
    y_val = data["y_val"]

    LOGGER.info(
        "Loaded %d training samples, %d validation samples", len(X_train), len(X_val)
    )

    # Normalize outputs (xyY inputs are already in [0, 1] range)
    output_params = MUNSELL_NORMALIZATION_PARAMS
    y_train = normalize_munsell(y_train, output_params)
    y_val = normalize_munsell(y_val, output_params)

    # Convert to PyTorch tensors (float32)
    X_train_t = torch.FloatTensor(X_train)
    y_train_t = torch.FloatTensor(y_train)
    X_val_t = torch.FloatTensor(X_val)
    y_val_t = torch.FloatTensor(y_val)

    # Create data loaders; only the training set is shuffled.
    train_dataset = TensorDataset(X_train_t, y_train_t)
    val_dataset = TensorDataset(X_val_t, y_val_t)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model with the trial's architecture hyperparameters.
    model = MultiMLPToMunsell(
        chroma_width_multiplier=chroma_width, dropout=dropout
    ).to(device)

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    LOGGER.info("Total parameters: %s", f"{total_params:,}")

    # Training setup (plain Adam, fixed lr — no scheduler in this script)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # MLflow setup
    run_name = setup_mlflow_experiment(
        "from_xyY", f"hparam_multi_mlp_trial_{trial.number}"
    )

    # Training loop with early stopping
    num_epochs = 100  # Reduced for hyperparameter search
    patience = 15
    best_val_loss = float("inf")
    patience_counter = 0

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(
            {
                "trial": trial.number,
                "lr": lr,
                "batch_size": batch_size,
                "chroma_width": chroma_width,
                "chroma_weight": chroma_weight,
                "code_weight": code_weight,
                "dropout": dropout,
                "total_params": total_params,
            }
        )

        for epoch in range(num_epochs):
            train_loss = train_epoch(
                model, train_loader, optimizer, device, chroma_weight, code_weight
            )
            val_loss = validate(model, val_loader, device, chroma_weight, code_weight)

            # Log to MLflow ("learning_rate" is the constant trial lr since
            # there is no scheduler here).
            mlflow.log_metrics(
                {
                    "train_loss": train_loss,
                    "val_loss": val_loss,
                    "learning_rate": lr,
                },
                step=epoch,
            )

            if (epoch + 1) % 10 == 0:
                LOGGER.info(
                    " Epoch %03d/%d - Train Loss: %.6f, Val Loss: %.6f",
                    epoch + 1,
                    num_epochs,
                    train_loss,
                    val_loss,
                )

            # Early stopping.  Note the `break` fires before trial.report,
            # so the epoch that triggers early stopping is not reported to
            # the pruner.
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    LOGGER.info(" Early stopping at epoch %d", epoch + 1)
                    break

            # Report intermediate value for pruning
            trial.report(val_loss, epoch)

            # Handle pruning: mark the run as pruned in MLflow, then let
            # Optuna record the trial as PRUNED via the exception.
            if trial.should_prune():
                LOGGER.info(" Trial pruned at epoch %d", epoch + 1)
                mlflow.log_metrics({"pruned": 1, "pruned_epoch": epoch})
                raise optuna.TrialPruned

        # Log final results
        mlflow.log_metrics(
            {
                "best_val_loss": best_val_loss,
                "final_train_loss": train_loss,
                "final_epoch": epoch + 1,
            }
        )

        LOGGER.info(" Final validation loss: %.6f", best_val_loss)

    return best_val_loss
362
+
363
+
364
def main() -> None:
    """
    Run hyperparameter search for Multi-MLP model.

    Performs systematic hyperparameter optimization using Optuna with:
    - MedianPruner for early stopping of unpromising trials
    - 15 total trials
    - MLflow logging for each trial
    - Result visualization using matplotlib (optimization history,
      parameter importances, parallel coordinate plot)

    The search aims to find optimal hyperparameters for converting xyY
    color coordinates to Munsell color specifications using a multi-MLP
    architecture with independent branches for each component.
    """

    LOGGER.info("=" * 80)
    LOGGER.info("Multi-MLP Hyperparameter Search with Optuna")
    LOGGER.info("=" * 80)

    # Create study
    study = optuna.create_study(
        direction="minimize",
        study_name="multi_mlp_hparam_search",
        pruner=optuna.pruners.MedianPruner(n_startup_trials=3, n_warmup_steps=10),
    )

    # Run optimization
    n_trials = 15  # Number of trials to run

    LOGGER.info("")
    LOGGER.info("Starting hyperparameter search with %d trials...", n_trials)
    LOGGER.info("")

    study.optimize(objective, n_trials=n_trials, timeout=None)

    # Print results
    LOGGER.info("")
    LOGGER.info("=" * 80)
    LOGGER.info("Hyperparameter Search Results")
    LOGGER.info("=" * 80)
    LOGGER.info("")
    LOGGER.info("Best trial:")
    LOGGER.info(" Value (val_loss): %.6f", study.best_value)
    LOGGER.info("")
    LOGGER.info("Best hyperparameters:")
    for key, value in study.best_params.items():
        LOGGER.info(" %s: %s", key, value)

    # Save results.
    results_dir = PROJECT_ROOT / "results" / "from_xyY"
    # FIX: parents=True so a missing "results" directory is created instead
    # of raising FileNotFoundError (matches the sibling hparam scripts).
    results_dir.mkdir(exist_ok=True, parents=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = results_dir / f"hparam_search_{timestamp}.txt"

    with open(results_file, "w") as f:
        f.write("=" * 80 + "\n")
        f.write("Multi-MLP Hyperparameter Search Results\n")
        f.write("=" * 80 + "\n\n")
        f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Number of trials: {len(study.trials)}\n")
        f.write(f"Best validation loss: {study.best_value:.6f}\n\n")
        f.write("Best hyperparameters:\n")
        for key, value in study.best_params.items():
            f.write(f" {key}: {value}\n")
        f.write("\n\nAll trials:\n")
        f.write("-" * 80 + "\n")

        # Named `t` (not `trial`) to avoid shadowing the identifier used as
        # objective()'s parameter elsewhere in this module.
        for t in study.trials:
            f.write(f"\nTrial {t.number}:\n")
            # FIX: the previous version embedded a conditional expression in
            # the f-string format spec ({value:.6f if value else 'Pruned'});
            # everything after ':' is a literal format spec, so it raised
            # "ValueError: Invalid format specifier" at runtime.  Also test
            # for None explicitly — a val_loss of 0.0 is a valid value,
            # while pruned/failed trials have value None.
            if t.value is not None:
                f.write(f" Value: {t.value:.6f}\n")
            else:
                f.write(" Value: Pruned\n")
            f.write(" Params:\n")
            for key, value in t.params.items():
                f.write(f" {key}: {value}\n")

    LOGGER.info("")
    LOGGER.info("Results saved to: %s", results_file)

    # Generate visualizations using matplotlib (imported here, as in the
    # sibling scripts, so the plotting extras stay out of module import).
    from optuna.visualization.matplotlib import (
        plot_optimization_history,
        plot_param_importances,
        plot_parallel_coordinate,
    )

    # Optimization history
    ax = plot_optimization_history(study)
    ax.figure.savefig(results_dir / f"optimization_history_{timestamp}.png", dpi=150)
    plt.close(ax.figure)

    # Parameter importances
    ax = plot_param_importances(study)
    ax.figure.savefig(results_dir / f"param_importances_{timestamp}.png", dpi=150)
    plt.close(ax.figure)

    # Parallel coordinate plot
    ax = plot_parallel_coordinate(study)
    ax.figure.savefig(results_dir / f"parallel_coordinate_{timestamp}.png", dpi=150)
    plt.close(ax.figure)

    LOGGER.info("Visualizations saved to: %s", results_dir)
466
+
467
+
468
if __name__ == "__main__":
    # force=True replaces any handlers already installed by imported
    # libraries so the bare-message format takes effect.
    logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)

    main()
learning_munsell/training/from_xyY/refine_multi_head_real.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Refine Multi-Head model on REAL Munsell colors only.
3
+
4
+ This script fine-tunes the best Multi-Head model using only the 2734 real
5
+ (measured) Munsell colors, which should improve accuracy on the evaluation set.
6
+ """
7
+
8
+ import logging
9
+ from typing import Any
10
+
11
+ import click
12
+ import mlflow
13
+ import mlflow.pytorch
14
+ import numpy as np
15
+ import torch
16
+ from colour.notation.datasets.munsell import MUNSELL_COLOURS_REAL
17
+ from colour.notation.munsell import (
18
+ munsell_colour_to_munsell_specification,
19
+ munsell_specification_to_xyY,
20
+ )
21
+ from numpy.typing import NDArray
22
+ from sklearn.model_selection import train_test_split
23
+ from torch import nn, optim
24
+ from torch.utils.data import DataLoader, TensorDataset
25
+
26
+ from learning_munsell import PROJECT_ROOT
27
+ from learning_munsell.models.networks import MultiHeadMLPToMunsell
28
+ from learning_munsell.utilities.common import (
29
+ log_training_epoch,
30
+ setup_mlflow_experiment,
31
+ )
32
+ from learning_munsell.utilities.data import (
33
+ MUNSELL_NORMALIZATION_PARAMS,
34
+ XYY_NORMALIZATION_PARAMS,
35
+ normalize_munsell,
36
+ )
37
+ from learning_munsell.utilities.training import train_epoch, validate
38
+
39
+ LOGGER = logging.getLogger(__name__)
40
+
41
+
42
+ def generate_real_samples(
43
+ n_samples_per_color: int = 100,
44
+ perturbation_pct: float = 0.05,
45
+ ) -> tuple[NDArray, NDArray]:
46
+ """
47
+ Generate training samples from REAL (measured) Munsell colors only.
48
+
49
+ Creates augmented samples by applying small perturbations to the 2734 real
50
+ Munsell color specifications to increase training data while staying close
51
+ to measured values.
52
+
53
+ Parameters
54
+ ----------
55
+ n_samples_per_color : int, optional
56
+ Number of perturbed samples to generate per real color (default is 100).
57
+ perturbation_pct : float, optional
58
+ Percentage of range to use for perturbations (default is 0.05 = 5%).
59
+
60
+ Returns
61
+ -------
62
+ xyY_samples : NDArray
63
+ Array of shape (n_samples, 3) containing xyY coordinates.
64
+ munsell_samples : NDArray
65
+ Array of shape (n_samples, 4) containing Munsell specifications
66
+ [hue, value, chroma, code].
67
+
68
+ Notes
69
+ -----
70
+ Perturbations are applied uniformly within ±perturbation_pct of the
71
+ component ranges:
72
+ - Hue range: 9.5 (0.5 to 10.0)
73
+ - Value range: 9.0 (1.0 to 10.0)
74
+ - Chroma range: 50.0 (0.0 to 50.0)
75
+
76
+ Invalid samples (that cannot be converted to xyY) are skipped.
77
+ """
78
+ LOGGER.info(
79
+ "Generating samples from %d REAL Munsell colors...", len(MUNSELL_COLOURS_REAL)
80
+ )
81
+
82
+ np.random.seed(42)
83
+
84
+ hue_range = 9.5
85
+ value_range = 9.0
86
+ chroma_range = 50.0
87
+
88
+ xyY_samples = []
89
+ munsell_samples = []
90
+
91
+ for munsell_spec_tuple, _ in MUNSELL_COLOURS_REAL:
92
+ hue_code_str, value, chroma = munsell_spec_tuple
93
+ munsell_str = f"{hue_code_str} {value}/{chroma}"
94
+ base_spec = munsell_colour_to_munsell_specification(munsell_str)
95
+
96
+ for _ in range(n_samples_per_color):
97
+ hue_delta = np.random.uniform(
98
+ -perturbation_pct * hue_range, perturbation_pct * hue_range
99
+ )
100
+ value_delta = np.random.uniform(
101
+ -perturbation_pct * value_range, perturbation_pct * value_range
102
+ )
103
+ chroma_delta = np.random.uniform(
104
+ -perturbation_pct * chroma_range, perturbation_pct * chroma_range
105
+ )
106
+
107
+ perturbed_spec = base_spec.copy()
108
+ perturbed_spec[0] = np.clip(base_spec[0] + hue_delta, 0.5, 10.0)
109
+ perturbed_spec[1] = np.clip(base_spec[1] + value_delta, 1.0, 10.0)
110
+ perturbed_spec[2] = np.clip(base_spec[2] + chroma_delta, 0.0, 50.0)
111
+
112
+ try:
113
+ xyY = munsell_specification_to_xyY(perturbed_spec)
114
+ xyY_samples.append(xyY)
115
+ munsell_samples.append(perturbed_spec)
116
+ except Exception:
117
+ continue
118
+
119
+ LOGGER.info("Generated %d samples", len(xyY_samples))
120
+ return np.array(xyY_samples), np.array(munsell_samples)
121
+
122
+
123
+ @click.command()
124
+ @click.option("--epochs", default=200, help="Number of training epochs")
125
+ @click.option("--batch-size", default=512, help="Batch size for training")
126
+ @click.option("--lr", default=1e-5, help="Learning rate")
127
+ @click.option("--patience", default=30, help="Early stopping patience")
128
+ def main(epochs: int, batch_size: int, lr: float, patience: int) -> None:
129
+ """
130
+ Refine Multi-Head model on REAL Munsell colors only.
131
+
132
+ Fine-tunes a pretrained Multi-Head MLP model using only the 2734 real
133
+ (measured) Munsell colors with small perturbations. This refinement step
134
+ aims to improve accuracy on actual measured colors by focusing the model
135
+ on the real color gamut.
136
+
137
+ Notes
138
+ -----
139
+ Training configuration:
140
+ - Dataset: 2734 real Munsell colors with 200 samples per color
141
+ - Perturbation: 3% of component ranges (smaller than initial training)
142
+ - Learning rate: 1e-5 (lower for fine-tuning)
143
+ - Batch size: 512
144
+ - Early stopping: patience of 30 epochs
145
+ - Optimizer: AdamW with weight decay 0.01
146
+ - Scheduler: ReduceLROnPlateau with factor 0.5, patience 15
147
+
148
+ Workflow:
149
+ 1. Generate augmented samples from real Munsell colors
150
+ 2. Load pretrained model (multi_head_large_best.pth)
151
+ 3. Fine-tune with lower learning rate
152
+ 4. Save best model based on validation loss
153
+ 5. Export to ONNX format
154
+ 6. Log metrics to MLflow
155
+
156
+ Files generated:
157
+ - multi_head_refined_real_best.pth: Best checkpoint
158
+ - multi_head_refined_real.onnx: ONNX model
159
+ - multi_head_refined_real_normalization_params.npz: Normalization params
160
+ """
161
+ LOGGER.info("=" * 80)
162
+ LOGGER.info("Multi-Head Refinement on REAL Munsell Colors")
163
+ LOGGER.info("=" * 80)
164
+
165
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
166
+ if torch.backends.mps.is_available():
167
+ device = torch.device("mps")
168
+ LOGGER.info("Using device: %s", device)
169
+
170
+ # Generate REAL-only samples
171
+ LOGGER.info("")
172
+ xyY_all, munsell_all = generate_real_samples(
173
+ n_samples_per_color=200, # 200 samples per real color
174
+ perturbation_pct=0.03, # Smaller perturbations for refinement
175
+ )
176
+
177
+ # Split data
178
+ X_train, X_val, y_train, y_val = train_test_split(
179
+ xyY_all, munsell_all, test_size=0.15, random_state=42
180
+ )
181
+
182
+ LOGGER.info("Train samples: %d", len(X_train))
183
+ LOGGER.info("Validation samples: %d", len(X_val))
184
+
185
+ # Normalize outputs (xyY inputs are already in [0, 1] range)
186
+ # Use hardcoded ranges covering the full Munsell space for generalization
187
+ output_params = MUNSELL_NORMALIZATION_PARAMS
188
+ y_train_norm = normalize_munsell(y_train, output_params)
189
+ y_val_norm = normalize_munsell(y_val, output_params)
190
+
191
+ # Convert to tensors
192
+ X_train_t = torch.FloatTensor(X_train)
193
+ y_train_t = torch.FloatTensor(y_train_norm)
194
+ X_val_t = torch.FloatTensor(X_val)
195
+ y_val_t = torch.FloatTensor(y_val_norm)
196
+
197
+ # Data loaders
198
+ train_dataset = TensorDataset(X_train_t, y_train_t)
199
+ val_dataset = TensorDataset(X_val_t, y_val_t)
200
+ train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
201
+ val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
202
+
203
+ # Load pretrained model
204
+ model_directory = PROJECT_ROOT / "models" / "from_xyY"
205
+ pretrained_path = model_directory / "multi_head_large_best.pth"
206
+
207
+ model = MultiHeadMLPToMunsell().to(device)
208
+
209
+ if pretrained_path.exists():
210
+ LOGGER.info("")
211
+ LOGGER.info("Loading pretrained model from %s...", pretrained_path)
212
+ checkpoint = torch.load(
213
+ pretrained_path, weights_only=False, map_location=device
214
+ )
215
+ model.load_state_dict(checkpoint["model_state_dict"])
216
+ LOGGER.info("Pretrained model loaded successfully")
217
+ else:
218
+ LOGGER.info("")
219
+ LOGGER.info("No pretrained model found, training from scratch")
220
+
221
+ total_params = sum(p.numel() for p in model.parameters())
222
+ LOGGER.info("Total parameters: %s", f"{total_params:,}")
223
+
224
+ # Fine-tuning with lower learning rate
225
+ optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
226
+ scheduler = optim.lr_scheduler.ReduceLROnPlateau(
227
+ optimizer, mode="min", factor=0.5, patience=15
228
+ )
229
+ criterion = nn.MSELoss()
230
+
231
+ # MLflow setup
232
+ run_name = setup_mlflow_experiment("from_xyY", "multi_head_refined_real")
233
+
234
+ LOGGER.info("")
235
+ LOGGER.info("MLflow run: %s", run_name)
236
+ LOGGER.info("Learning rate: %e (fine-tuning)", lr)
237
+
238
+ # Training loop
239
+ best_val_loss = float("inf")
240
+ patience_counter = 0
241
+
242
+ LOGGER.info("")
243
+ LOGGER.info("Starting refinement training...")
244
+
245
+ with mlflow.start_run(run_name=run_name):
246
+ mlflow.log_params(
247
+ {
248
+ "model": "multi_head_refined_real",
249
+ "learning_rate": lr,
250
+ "batch_size": batch_size,
251
+ "num_epochs": epochs,
252
+ "patience": patience,
253
+ "total_params": total_params,
254
+ "train_samples": len(X_train),
255
+ "val_samples": len(X_val),
256
+ "dataset": "REAL_only",
257
+ "perturbation_pct": 0.03,
258
+ "samples_per_color": 200,
259
+ }
260
+ )
261
+
262
+ for epoch in range(epochs):
263
+ train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
264
+ val_loss = validate(model, val_loader, criterion, device)
265
+
266
+ scheduler.step(val_loss)
267
+
268
+ log_training_epoch(
269
+ epoch, train_loss, val_loss, optimizer.param_groups[0]["lr"]
270
+ )
271
+
272
+ LOGGER.info(
273
+ "Epoch %03d/%d - Train Loss: %.6f, Val Loss: %.6f, LR: %.2e",
274
+ epoch + 1,
275
+ epochs,
276
+ train_loss,
277
+ val_loss,
278
+ optimizer.param_groups[0]["lr"],
279
+ )
280
+
281
+ if val_loss < best_val_loss:
282
+ best_val_loss = val_loss
283
+ patience_counter = 0
284
+
285
+ checkpoint_file = model_directory / "multi_head_refined_real_best.pth"
286
+
287
+ torch.save(
288
+ {
289
+ "model_state_dict": model.state_dict(),
290
+ "output_params": output_params,
291
+ "epoch": epoch,
292
+ "val_loss": val_loss,
293
+ },
294
+ checkpoint_file,
295
+ )
296
+
297
+ LOGGER.info(" -> Saved best model (val_loss: %.6f)", val_loss)
298
+ else:
299
+ patience_counter += 1
300
+ if patience_counter >= patience:
301
+ LOGGER.info("")
302
+ LOGGER.info("Early stopping after %d epochs", epoch + 1)
303
+ break
304
+
305
+ mlflow.log_metrics(
306
+ {
307
+ "best_val_loss": best_val_loss,
308
+ "final_epoch": epoch + 1,
309
+ }
310
+ )
311
+
312
+ # Export to ONNX
313
+ LOGGER.info("")
314
+ LOGGER.info("Exporting refined model to ONNX...")
315
+ model.eval()
316
+
317
+ checkpoint = torch.load(checkpoint_file, weights_only=False)
318
+ model.load_state_dict(checkpoint["model_state_dict"])
319
+
320
+ model_cpu = model.cpu()
321
+ dummy_input = torch.randn(1, 3)
322
+
323
+ onnx_file = model_directory / "multi_head_refined_real.onnx"
324
+ torch.onnx.export(
325
+ model_cpu,
326
+ dummy_input,
327
+ onnx_file,
328
+ export_params=True,
329
+ opset_version=14,
330
+ input_names=["xyY"],
331
+ output_names=["munsell_spec"],
332
+ dynamic_axes={"xyY": {0: "batch_size"}, "munsell_spec": {0: "batch_size"}},
333
+ )
334
+
335
+ params_file = (
336
+ model_directory / "multi_head_refined_real_normalization_params.npz"
337
+ )
338
+ input_params = XYY_NORMALIZATION_PARAMS
339
+ np.savez(
340
+ params_file,
341
+ input_params=input_params,
342
+ output_params=output_params,
343
+ )
344
+
345
+ mlflow.log_artifact(str(checkpoint_file))
346
+ mlflow.log_artifact(str(onnx_file))
347
+ mlflow.log_artifact(str(params_file))
348
+
349
+ LOGGER.info("ONNX model saved to: %s", onnx_file)
350
+ LOGGER.info("Normalization params saved to: %s", params_file)
351
+
352
+ LOGGER.info("=" * 80)
353
+
354
+
355
if __name__ == "__main__":
    # force=True replaces any handlers already installed by imported
    # libraries so the bare-message format takes effect.
    logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)

    main()
learning_munsell/training/from_xyY/train_deep_wide.py ADDED
@@ -0,0 +1,371 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train Deep + Wide model for xyY to Munsell conversion.
3
+
4
+ Option 5: Hybrid Deep + Wide architecture
5
+ - Input: 3 features (xyY)
6
+ - Deep path: 3 → 512 → 1024 (ResBlocks) → 512
7
+ - Wide path: 3 → 128 (direct linear)
8
+ - Combine: [512, 128] → 256 → 4
9
+ - Output: 4 features (hue, value, chroma, code)
10
+ """
11
+
12
+ import logging
13
+ from typing import Any
14
+
15
+ import click
16
+ import mlflow
17
+ import mlflow.pytorch
18
+ import numpy as np
19
+ import torch
20
+ from numpy.typing import NDArray
21
+ from torch import nn, optim
22
+ from torch.utils.data import DataLoader, TensorDataset
23
+
24
+ from learning_munsell import PROJECT_ROOT
25
+ from learning_munsell.models.networks import ResidualBlock
26
+ from learning_munsell.utilities.common import log_training_epoch, setup_mlflow_experiment
27
+ from learning_munsell.utilities.data import (
28
+ MUNSELL_NORMALIZATION_PARAMS,
29
+ XYY_NORMALIZATION_PARAMS,
30
+ normalize_munsell,
31
+ )
32
+ from learning_munsell.utilities.losses import precision_focused_loss
33
+ from learning_munsell.utilities.training import train_epoch, validate
34
+
35
+ LOGGER = logging.getLogger(__name__)
36
+
37
+
38
class DeepWideNet(nn.Module):
    """
    Hybrid deep-and-wide network mapping xyY colours to Munsell specifications.

    Two parallel paths process the same 3-component input: a deep path that
    learns complex non-linear structure, and a wide path that keeps a direct,
    shallow view of the input. Their features are concatenated and projected
    to the 4 Munsell components.

    Parameters
    ----------
    num_residual_blocks : int, optional
        Number of residual blocks in the deep path. Default is 4.

    Attributes
    ----------
    deep_encoder : nn.Sequential
        Deep path encoder: 3 → 512 → 1024.
    deep_residual_blocks : nn.ModuleList
        Stack of residual blocks operating at width 1024.
    deep_decoder : nn.Sequential
        Deep path decoder: 1024 → 512.
    wide_path : nn.Sequential
        Wide path: 3 → 128.
    output_head : nn.Sequential
        Combined head: concat(512, 128) = 640 → 256 → 4.

    Notes
    -----
    Inspired by Google's Wide & Deep Learning: the deep path captures
    intricate feature interactions while the wide path preserves simple,
    near-linear relationships between input and output.
    """

    def __init__(self, num_residual_blocks: int = 4) -> None:
        """Build both paths and the shared output head."""
        super().__init__()

        # Deep path encoder: lift the 3 xyY features to a 1024-wide
        # representation (Linear → GELU → BatchNorm at each step).
        self.deep_encoder = nn.Sequential(
            nn.Linear(3, 512),
            nn.GELU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, 1024),
            nn.GELU(),
            nn.BatchNorm1d(1024),
        )

        # Residual refinement at constant width 1024.
        self.deep_residual_blocks = nn.ModuleList(
            ResidualBlock(1024) for _ in range(num_residual_blocks)
        )

        # Compress the deep representation back down to 512 features.
        self.deep_decoder = nn.Sequential(
            nn.Linear(1024, 512),
            nn.GELU(),
            nn.BatchNorm1d(512),
        )

        # Wide path: a single shallow projection of the raw input.
        self.wide_path = nn.Sequential(
            nn.Linear(3, 128),
            nn.GELU(),
            nn.BatchNorm1d(128),
        )

        # Fused head over the concatenated deep (512) + wide (128) features.
        self.output_head = nn.Sequential(
            nn.Linear(640, 256),
            nn.GELU(),
            nn.BatchNorm1d(256),
            nn.Linear(256, 4),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Run the input through both paths and the fused head.

        Parameters
        ----------
        x : Tensor
            Normalized xyY values, shape (batch_size, 3).

        Returns
        -------
        Tensor
            Normalized Munsell specification [hue, value, chroma, code],
            shape (batch_size, 4).
        """
        # Deep path: encode, refine through the residual stack, decode.
        h = self.deep_encoder(x)
        for residual in self.deep_residual_blocks:
            h = residual(h)
        deep_features = self.deep_decoder(h)

        # Wide path sees the raw input directly.
        wide_features = self.wide_path(x)

        # Fuse both views and project to the 4 Munsell components.
        fused = torch.cat((deep_features, wide_features), dim=1)
        return self.output_head(fused)
152
+
153
+
154
@click.command()
@click.option("--epochs", default=200, help="Number of training epochs")
@click.option("--batch-size", default=1024, help="Batch size for training")
@click.option("--lr", default=3e-4, help="Learning rate")
@click.option("--patience", default=20, help="Early stopping patience")
def main(epochs: int, batch_size: int, lr: float, patience: int) -> None:
    """
    Train the DeepWideNet model for xyY to Munsell conversion.

    Parameters
    ----------
    epochs : int
        Maximum number of training epochs.
    batch_size : int
        Batch size for training and validation loaders.
    lr : float
        Initial learning rate for AdamW.
    patience : int
        Number of epochs without validation improvement before early stop.

    Notes
    -----
    The training pipeline:
    1. Loads training data from cache
    2. Normalizes outputs to [0, 1] range (xyY inputs already are)
    3. Creates PyTorch DataLoaders
    4. Initializes DeepWideNet with deep and wide paths
    5. Trains with AdamW optimizer and precision-focused loss
    6. Uses learning rate scheduler (ReduceLROnPlateau)
    7. Implements early stopping based on validation loss
    8. Exports best model to ONNX format
    9. Logs all metrics and artifacts to MLflow
    """

    LOGGER.info("=" * 80)
    LOGGER.info("Deep + Wide Network: xyY → Munsell")
    LOGGER.info("=" * 80)

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    LOGGER.info("Using device: %s", device)

    # Paths. Create the model directory (and any missing parents) up-front so
    # the checkpoint path is always defined and the mkdir is not repeated
    # inside the training loop.
    model_directory = PROJECT_ROOT / "models" / "from_xyY"
    model_directory.mkdir(parents=True, exist_ok=True)
    checkpoint_file = model_directory / "deep_wide_best.pth"
    data_dir = PROJECT_ROOT / "data"
    cache_file = data_dir / "training_data.npz"

    # Load training data
    LOGGER.info("")
    LOGGER.info("Loading training data from %s...", cache_file)
    data = np.load(cache_file)
    X_train = data["X_train"]
    y_train = data["y_train"]
    X_val = data["X_val"]
    y_val = data["y_val"]

    LOGGER.info("Train samples: %d", len(X_train))
    LOGGER.info("Validation samples: %d", len(X_val))

    # Normalize outputs (xyY inputs are already in [0, 1] range).
    # Use hardcoded ranges covering the full Munsell space for generalization.
    output_params = MUNSELL_NORMALIZATION_PARAMS
    y_train_norm = normalize_munsell(y_train, output_params)
    y_val_norm = normalize_munsell(y_val, output_params)

    # Convert to PyTorch tensors
    X_train_t = torch.FloatTensor(X_train)
    y_train_t = torch.FloatTensor(y_train_norm)
    X_val_t = torch.FloatTensor(X_val)
    y_val_t = torch.FloatTensor(y_val_norm)

    # Create data loaders
    train_dataset = TensorDataset(X_train_t, y_train_t)
    val_dataset = TensorDataset(X_val_t, y_val_t)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model
    model = DeepWideNet(num_residual_blocks=4).to(device)
    LOGGER.info("")
    LOGGER.info("Deep + Wide architecture:")
    LOGGER.info("%s", model)

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    LOGGER.info("Total parameters: %s", f"{total_params:,}")

    # Training setup
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.5, patience=5
    )
    criterion = precision_focused_loss

    # MLflow setup
    run_name = setup_mlflow_experiment("from_xyY", "deep_wide")

    LOGGER.info("")
    LOGGER.info("MLflow run: %s", run_name)

    # Training loop
    best_val_loss = float("inf")
    patience_counter = 0

    LOGGER.info("")
    LOGGER.info("Starting training...")

    with mlflow.start_run(run_name=run_name):
        # Log parameters
        mlflow.log_params(
            {
                "model": "deep_wide",
                "learning_rate": lr,
                "batch_size": batch_size,
                "num_epochs": epochs,
                "patience": patience,
                "total_params": total_params,
            }
        )

        for epoch in range(epochs):
            train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
            val_loss = validate(model, val_loader, criterion, device)

            scheduler.step(val_loss)

            # Log to MLflow
            log_training_epoch(
                epoch, train_loss, val_loss, optimizer.param_groups[0]["lr"]
            )

            LOGGER.info(
                "Epoch %03d/%d - Train Loss: %.6f, Val Loss: %.6f, LR: %.6f",
                epoch + 1,
                epochs,
                train_loss,
                val_loss,
                optimizer.param_groups[0]["lr"],
            )

            # Early stopping: checkpoint only when validation improves.
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0

                torch.save(
                    {
                        "model_state_dict": model.state_dict(),
                        "epoch": epoch,
                        "val_loss": val_loss,
                    },
                    checkpoint_file,
                )

                LOGGER.info("  → Saved best model (val_loss: %.6f)", val_loss)
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    LOGGER.info("")
                    LOGGER.info("Early stopping after %d epochs", epoch + 1)
                    break

        # Log final metrics
        mlflow.log_metrics(
            {
                "best_val_loss": best_val_loss,
                "final_epoch": epoch + 1,
            }
        )

        # Export to ONNX from the best checkpoint (not the last epoch's weights)
        LOGGER.info("")
        LOGGER.info("Exporting to ONNX...")
        model.eval()

        # weights_only=False: the checkpoint stores a plain dict written by
        # this script; being explicit keeps loading stable across PyTorch
        # versions (>= 2.6 defaults to weights_only=True) and matches the
        # sibling training scripts.
        checkpoint = torch.load(checkpoint_file, weights_only=False)
        model.load_state_dict(checkpoint["model_state_dict"])

        dummy_input = torch.randn(1, 3).to(device)

        onnx_file = model_directory / "deep_wide.onnx"
        torch.onnx.export(
            model,
            dummy_input,
            onnx_file,
            export_params=True,
            opset_version=15,
            input_names=["xyY"],
            output_names=["munsell_spec"],
            dynamic_axes={
                "xyY": {0: "batch_size"},
                "munsell_spec": {0: "batch_size"},
            },
        )

        # Save normalization parameters alongside model
        params_file = model_directory / "deep_wide_normalization_params.npz"
        input_params = XYY_NORMALIZATION_PARAMS
        np.savez(
            params_file,
            input_params=input_params,
            output_params=output_params,
        )

        # Log artifacts to MLflow
        mlflow.log_artifact(str(checkpoint_file))
        mlflow.log_artifact(str(onnx_file))
        mlflow.log_artifact(str(params_file))
        mlflow.pytorch.log_model(model, "model")

        LOGGER.info("ONNX model saved to: %s", onnx_file)
        LOGGER.info("Normalization parameters saved to: %s", params_file)
        LOGGER.info("Artifacts logged to MLflow")

    LOGGER.info("=" * 80)
366
+
367
+
368
if __name__ == "__main__":
    # CLI entry point: configure root logging for plain-message console
    # output; force=True replaces any handlers installed by imported modules.
    logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)

    main()
learning_munsell/training/from_xyY/train_ft_transformer.py ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train FT-Transformer model for xyY to Munsell conversion.
3
+
4
+ Option 4: Feature Tokenizer + Transformer architecture
5
+ - Input: 3 features (xyY) → each becomes a 256-dim token
6
+ - Add [CLS] token for regression
7
+ - 4-6 transformer blocks with multi-head attention
8
+ - Output: Take [CLS] token → MLP → 4 features
9
+ """
10
+
11
+ import logging
12
+ import click
13
+ from typing import Any
14
+
15
+ import mlflow
16
+ import mlflow.pytorch
17
+ import numpy as np
18
+ import torch
19
+ from numpy.typing import NDArray
20
+ from torch import nn, optim
21
+ from torch.utils.data import DataLoader, TensorDataset
22
+
23
+ from learning_munsell import PROJECT_ROOT
24
+ from learning_munsell.models.networks import FeatureTokenizer, TransformerBlock
25
+ from learning_munsell.utilities.common import log_training_epoch, setup_mlflow_experiment
26
+ from learning_munsell.utilities.data import (
27
+ MUNSELL_NORMALIZATION_PARAMS,
28
+ XYY_NORMALIZATION_PARAMS,
29
+ normalize_munsell,
30
+ )
31
+ from learning_munsell.utilities.losses import precision_focused_loss
32
+ from learning_munsell.utilities.training import train_epoch, validate
33
+
34
+ LOGGER = logging.getLogger(__name__)
35
+
36
+
37
class FTTransformer(nn.Module):
    """
    Feature Tokenizer + Transformer for xyY to Munsell conversion.

    Adapts the transformer architecture to tabular input: every scalar
    feature is embedded as its own token, a learnable CLS token is prepended,
    and stacked self-attention blocks model feature interactions. The final
    CLS embedding is decoded into the 4 Munsell components.

    Parameters
    ----------
    num_features : int, optional
        Number of input features (xyY), default is 3.
    embedding_dim : int, optional
        Dimension of token embeddings, default is 256.
    num_blocks : int, optional
        Number of transformer blocks, default is 4.
    num_heads : int, optional
        Number of attention heads, default is 4.
    ff_dim : int, optional
        Feedforward network hidden dimension, default is 512.
    dropout : float, optional
        Dropout probability, default is 0.1.

    Attributes
    ----------
    tokenizer : FeatureTokenizer
        Embeds each input feature as a token and prepends the CLS token.
    transformer_blocks : nn.ModuleList
        Stack of self-attention transformer blocks.
    output_head : nn.Sequential
        MLP decoding the CLS embedding into the 4 outputs.
    """

    def __init__(
        self,
        num_features: int = 3,
        embedding_dim: int = 256,
        num_blocks: int = 4,
        num_heads: int = 4,
        ff_dim: int = 512,
        dropout: float = 0.1,
    ) -> None:
        """Assemble tokenizer, transformer stack, and regression head."""
        super().__init__()

        # Per-feature tokenization (adds the CLS token internally).
        self.tokenizer = FeatureTokenizer(num_features, embedding_dim)

        # Self-attention stack over the (1 + num_features) tokens.
        self.transformer_blocks = nn.ModuleList(
            TransformerBlock(embedding_dim, num_heads, ff_dim, dropout)
            for _ in range(num_blocks)
        )

        # Regression head applied to the CLS embedding only.
        self.output_head = nn.Sequential(
            nn.Linear(embedding_dim, 128),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(128, 4),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Predict Munsell components for a batch of xyY inputs.

        Parameters
        ----------
        x : Tensor
            Input xyY values of shape (batch_size, 3).

        Returns
        -------
        Tensor
            Predicted Munsell specification [hue, value, chroma, code]
            of shape (batch_size, 4).
        """
        # (batch_size, 1 + num_features, embedding_dim) token sequence.
        tokens = self.tokenizer(x)

        # Run the full transformer stack.
        for transformer in self.transformer_blocks:
            tokens = transformer(tokens)

        # The CLS token (position 0) summarizes the sequence for regression.
        cls_embedding = tokens[:, 0]

        return self.output_head(cls_embedding)
135
+
136
+
137
@click.command()
@click.option("--epochs", default=200, help="Number of training epochs")
@click.option("--batch-size", default=1024, help="Batch size for training")
@click.option("--lr", default=3e-4, help="Learning rate")
@click.option("--patience", default=20, help="Early stopping patience")
def main(epochs: int, batch_size: int, lr: float, patience: int) -> None:
    """
    Train FT-Transformer model for xyY to Munsell conversion.

    Parameters
    ----------
    epochs : int
        Maximum number of training epochs.
    batch_size : int
        Batch size for training and validation loaders.
    lr : float
        Initial learning rate for AdamW.
    patience : int
        Number of epochs without validation improvement before early stop.

    Notes
    -----
    The training pipeline:
    1. Loads training data from cache
    2. Normalizes outputs to [0, 1] range (xyY inputs already are)
    3. Creates PyTorch DataLoaders
    4. Initializes FT-Transformer with feature tokenization
    5. Trains with AdamW optimizer and precision-focused loss
    6. Uses learning rate scheduler (ReduceLROnPlateau)
    7. Implements early stopping based on validation loss
    8. Exports best model to ONNX format
    9. Logs all metrics and artifacts to MLflow
    """

    LOGGER.info("=" * 80)
    LOGGER.info("FT-Transformer: xyY → Munsell")
    LOGGER.info("=" * 80)

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    LOGGER.info("Using device: %s", device)

    # Paths. Create the model directory (and any missing parents) up-front so
    # the checkpoint path is always defined and the mkdir is not repeated
    # inside the training loop.
    model_directory = PROJECT_ROOT / "models" / "from_xyY"
    model_directory.mkdir(parents=True, exist_ok=True)
    checkpoint_file = model_directory / "ft_transformer_best.pth"
    data_dir = PROJECT_ROOT / "data"
    cache_file = data_dir / "training_data.npz"

    # Load training data
    LOGGER.info("")
    LOGGER.info("Loading training data from %s...", cache_file)
    data = np.load(cache_file)
    X_train = data["X_train"]
    y_train = data["y_train"]
    X_val = data["X_val"]
    y_val = data["y_val"]

    LOGGER.info("Train samples: %d", len(X_train))
    LOGGER.info("Validation samples: %d", len(X_val))

    # Normalize outputs (xyY inputs are already in [0, 1] range)
    output_params = MUNSELL_NORMALIZATION_PARAMS
    y_train_norm = normalize_munsell(y_train, output_params)
    y_val_norm = normalize_munsell(y_val, output_params)

    # Convert to PyTorch tensors
    X_train_t = torch.FloatTensor(X_train)
    y_train_t = torch.FloatTensor(y_train_norm)
    X_val_t = torch.FloatTensor(X_val)
    y_val_t = torch.FloatTensor(y_val_norm)

    # Create data loaders
    train_dataset = TensorDataset(X_train_t, y_train_t)
    val_dataset = TensorDataset(X_val_t, y_val_t)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model
    model = FTTransformer(
        num_features=3,
        embedding_dim=256,
        num_blocks=4,
        num_heads=4,
        ff_dim=512,
        dropout=0.1,
    ).to(device)

    LOGGER.info("")
    LOGGER.info("FT-Transformer architecture:")
    LOGGER.info("%s", model)

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    LOGGER.info("Total parameters: %s", f"{total_params:,}")

    # Training setup
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.5, patience=5
    )
    criterion = precision_focused_loss

    # MLflow setup
    run_name = setup_mlflow_experiment("from_xyY", "ft_transformer")

    LOGGER.info("")
    LOGGER.info("MLflow run: %s", run_name)

    # Training loop
    best_val_loss = float("inf")
    patience_counter = 0

    LOGGER.info("")
    LOGGER.info("Starting training...")

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(
            {
                "model": "ft_transformer",
                "learning_rate": lr,
                "batch_size": batch_size,
                "num_epochs": epochs,
                "patience": patience,
                "total_params": total_params,
            }
        )

        for epoch in range(epochs):
            train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
            val_loss = validate(model, val_loader, criterion, device)

            scheduler.step(val_loss)

            log_training_epoch(
                epoch, train_loss, val_loss, optimizer.param_groups[0]["lr"]
            )

            LOGGER.info(
                "Epoch %03d/%d - Train Loss: %.6f, Val Loss: %.6f, LR: %.6f",
                epoch + 1,
                epochs,
                train_loss,
                val_loss,
                optimizer.param_groups[0]["lr"],
            )

            # Early stopping: checkpoint only when validation improves.
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0

                torch.save(
                    {
                        "model_state_dict": model.state_dict(),
                        "epoch": epoch,
                        "val_loss": val_loss,
                    },
                    checkpoint_file,
                )

                LOGGER.info("  → Saved best model (val_loss: %.6f)", val_loss)
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    LOGGER.info("")
                    LOGGER.info("Early stopping after %d epochs", epoch + 1)
                    break

        mlflow.log_metrics(
            {
                "best_val_loss": best_val_loss,
                "final_epoch": epoch + 1,
            }
        )

        # Export to ONNX from the best checkpoint (not the last epoch's weights)
        LOGGER.info("")
        LOGGER.info("Exporting to ONNX...")
        model.eval()

        # weights_only=False: the checkpoint stores a plain dict written by
        # this script; being explicit keeps loading stable across PyTorch
        # versions (>= 2.6 defaults to weights_only=True) and matches the
        # sibling training scripts.
        checkpoint = torch.load(checkpoint_file, weights_only=False)
        model.load_state_dict(checkpoint["model_state_dict"])

        dummy_input = torch.randn(1, 3).to(device)

        onnx_file = model_directory / "ft_transformer.onnx"
        torch.onnx.export(
            model,
            dummy_input,
            onnx_file,
            export_params=True,
            opset_version=15,
            input_names=["xyY"],
            output_names=["munsell_spec"],
            dynamic_axes={
                "xyY": {0: "batch_size"},
                "munsell_spec": {0: "batch_size"},
            },
        )

        # Save normalization parameters alongside model
        params_file = model_directory / "ft_transformer_normalization_params.npz"
        input_params = XYY_NORMALIZATION_PARAMS
        np.savez(
            params_file,
            input_params=input_params,
            output_params=output_params,
        )

        mlflow.log_artifact(str(checkpoint_file))
        mlflow.log_artifact(str(onnx_file))
        mlflow.log_artifact(str(params_file))
        mlflow.pytorch.log_model(model, "model")

        LOGGER.info("ONNX model saved to: %s", onnx_file)
        LOGGER.info("Normalization parameters saved to: %s", params_file)
        LOGGER.info("Artifacts logged to MLflow")

    LOGGER.info("=" * 80)
351
+
352
+
353
if __name__ == "__main__":
    # CLI entry point: configure root logging for plain-message console
    # output; force=True replaces any handlers installed by imported modules.
    logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)

    main()
learning_munsell/training/from_xyY/train_mixture_of_experts.py ADDED
@@ -0,0 +1,620 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train Mixture of Experts model for xyY to Munsell conversion.
3
+
4
+ Option 6: Mixture of Experts architecture
5
+ - Input: 3 features (xyY)
6
+ - Gating network: 3 → 128 → 64 → 4 (softmax weights)
7
+ - 4 Expert networks: Each 3 → 256 → 256 → 4 (MLP)
8
+ - Output: Weighted combination of expert outputs
9
+ """
10
+
11
+ import logging
12
+ import click
13
+
14
+ import mlflow
15
+ import mlflow.pytorch
16
+ import numpy as np
17
+ import torch
18
+ from numpy.typing import NDArray
19
+ from torch import nn, optim
20
+ from torch.utils.data import DataLoader, TensorDataset
21
+
22
+ from learning_munsell import PROJECT_ROOT
23
+ from learning_munsell.models.networks import ResidualBlock
24
+ from learning_munsell.utilities.common import log_training_epoch, setup_mlflow_experiment
25
+ from learning_munsell.utilities.data import (
26
+ MUNSELL_NORMALIZATION_PARAMS,
27
+ XYY_NORMALIZATION_PARAMS,
28
+ normalize_munsell,
29
+ )
30
+
31
+ LOGGER = logging.getLogger(__name__)
32
+
33
+
34
class ExpertNetwork(nn.Module):
    """
    One expert of the mixture: a residual MLP over xyY inputs.

    Each expert learns to specialize on the region of colour space the
    gating network routes to it. Residual connections keep gradients
    flowing through the stack.

    Architecture
    ------------
    - Encoder: 3 → 256 (Linear → GELU → BatchNorm)
    - Residual stack: ``num_residual_blocks`` × ResidualBlock(256)
    - Decoder: 256 → 4

    Parameters
    ----------
    num_residual_blocks : int, optional
        Number of residual blocks, default is 2.

    Attributes
    ----------
    encoder : nn.Sequential
        Input projection to the 256-wide hidden space.
    residual_blocks : nn.ModuleList
        Residual refinement stack.
    decoder : nn.Sequential
        Final projection to the 4 Munsell components.
    """

    def __init__(self, num_residual_blocks: int = 2) -> None:
        """Build encoder, residual stack, and decoder."""
        super().__init__()

        self.encoder = nn.Sequential(
            nn.Linear(3, 256),
            nn.GELU(),
            nn.BatchNorm1d(256),
        )

        self.residual_blocks = nn.ModuleList(
            ResidualBlock(256) for _ in range(num_residual_blocks)
        )

        self.decoder = nn.Sequential(
            nn.Linear(256, 4),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Produce this expert's prediction for a batch of inputs.

        Parameters
        ----------
        x : Tensor
            Input xyY values of shape (batch_size, 3).

        Returns
        -------
        Tensor
            Expert's prediction of shape (batch_size, 4).
        """
        h = self.encoder(x)
        for residual in self.residual_blocks:
            h = residual(h)
        return self.decoder(h)
+ return self.decoder(x)
99
+
100
+
101
class GatingNetwork(nn.Module):
    """
    Routing network that assigns a mixing weight to each expert.

    A small MLP maps each xyY sample to one logit per expert; a softmax
    turns the logits into a probability distribution, so different inputs
    can activate different experts based on learned characteristics.

    Architecture
    ------------
    3 → 128 → 64 → num_experts → softmax

    Parameters
    ----------
    num_experts : int
        Number of expert networks to weight.

    Attributes
    ----------
    gate : nn.Sequential
        MLP producing unnormalized expert logits.
    """

    def __init__(self, num_experts: int) -> None:
        """Build the gating MLP with one output logit per expert."""
        super().__init__()

        self.gate = nn.Sequential(
            nn.Linear(3, 128),
            nn.GELU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 64),
            nn.GELU(),
            nn.BatchNorm1d(64),
            nn.Linear(64, num_experts),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Compute normalized expert weights for a batch of inputs.

        Parameters
        ----------
        x : Tensor
            Input xyY values of shape (batch_size, 3).

        Returns
        -------
        Tensor
            Softmax weights over experts of shape (batch_size, num_experts);
            non-negative and summing to 1 along the expert dimension.
        """
        logits = self.gate(x)
        # Normalize over the expert dimension into a probability simplex.
        return logits.softmax(dim=-1)
+ return torch.softmax(self.gate(x), dim=-1)
155
+
156
+
157
class MixtureOfExperts(nn.Module):
    """
    Mixture of Experts for xyY to Munsell conversion.

    Several specialized residual MLPs (experts) are blended per-sample by a
    learned gating distribution, letting each expert focus on a different
    region of the input space (e.g. different colour ranges or hue families).
    The gate weights are also returned so the training loss can add a load
    balancing term that discourages expert collapse.

    Parameters
    ----------
    num_experts : int, optional
        Number of expert networks, default is 4.
    num_residual_blocks : int, optional
        Number of residual blocks per expert, default is 2.

    Attributes
    ----------
    num_experts : int
        Number of expert networks.
    gating_network : GatingNetwork
        Produces per-sample softmax weights over the experts.
    experts : nn.ModuleList
        The expert networks.
    load_balance_weight : float
        Coefficient for the load balancing auxiliary loss.
    """

    def __init__(self, num_experts: int = 4, num_residual_blocks: int = 2) -> None:
        """Build the gate and the pool of experts."""
        super().__init__()

        self.num_experts = num_experts

        # Gate: per-sample distribution over experts.
        self.gating_network = GatingNetwork(num_experts)

        # Pool of identically shaped experts.
        self.experts = nn.ModuleList(
            ExpertNetwork(num_residual_blocks) for _ in range(num_experts)
        )

        # Coefficient applied to the load balancing auxiliary loss.
        self.load_balance_weight = 0.01

    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Blend expert predictions according to the gating weights.

        Parameters
        ----------
        x : Tensor
            Input xyY values of shape (batch_size, 3).

        Returns
        -------
        tuple
            (output, gate_weights) where:
            - output: Weighted expert predictions of shape (batch_size, 4)
            - gate_weights: Expert weights of shape (batch_size, num_experts)
        """
        # Per-sample mixing distribution: (batch_size, num_experts).
        weights = self.gating_network(x)

        # Every expert sees every sample: (batch_size, num_experts, 4).
        stacked = torch.stack([expert(x) for expert in self.experts], dim=1)

        # Convex combination of expert outputs per sample: (batch_size, 4).
        mixed = (stacked * weights[..., None]).sum(dim=1)

        return mixed, weights
242
+
243
+
244
def precision_focused_loss(
    pred: torch.Tensor,
    target: torch.Tensor,
    gate_weights: torch.Tensor,
    load_balance_weight: float = 0.01,
) -> torch.Tensor:
    """
    Precision-focused loss with load balancing for a mixture of experts.

    Combines several regression terms (MSE, MAE, a logarithmic penalty and a
    tight Huber term) with an auxiliary load-balancing loss that pushes the
    gate towards uniform expert usage, preventing expert collapse.

    Parameters
    ----------
    pred : torch.Tensor
        Predicted values.
    target : torch.Tensor
        Target ground truth values.
    gate_weights : torch.Tensor
        Expert gating weights of shape (batch_size, num_experts).
    load_balance_weight : float, optional
        Weight for the load balancing auxiliary loss, default is 0.01.

    Returns
    -------
    torch.Tensor
        Combined scalar loss including the load balancing term.

    Notes
    -----
    The load balancing term compares each expert's share of the total gate
    mass against the uniform target 1/num_experts, so no expert is left idle
    while others absorb all the data.
    """
    error = pred - target
    abs_error = torch.abs(error)

    # Classical regression terms.
    mse = torch.mean(error**2)
    mae = torch.mean(abs_error)

    # Log penalty magnifies very small residuals (scaled by 1000).
    log_penalty = torch.mean(torch.log1p(abs_error * 1000.0))

    # Huber term with a tight transition point at delta.
    delta = 0.01
    huber_loss = torch.mean(
        torch.where(
            abs_error <= delta,
            0.5 * abs_error**2,
            delta * (abs_error - 0.5 * delta),
        )
    )

    # Load balancing: each expert's share of the total gate mass ...
    usage = gate_weights.sum(dim=0)  # (num_experts,)
    usage = usage / usage.sum()
    # ... should match the uniform distribution 1/num_experts.
    uniform = torch.ones_like(usage) / gate_weights.size(1)
    load_balance_loss = torch.mean((usage - uniform) ** 2)

    return (
        1.0 * mse
        + 0.5 * mae
        + 0.3 * log_penalty
        + 0.5 * huber_loss
        + load_balance_weight * load_balance_loss
    )
308
+
309
+
310
def train_epoch(
    model: nn.Module,
    dataloader: DataLoader,
    optimizer: optim.Optimizer,
    device: torch.device,
) -> float:
    """
    Run one optimisation pass of the mixture of experts over the loader.

    Parameters
    ----------
    model : nn.Module
        The neural network model to train.
    dataloader : DataLoader
        DataLoader providing training batches (X, y).
    optimizer : optim.Optimizer
        Optimizer for updating model parameters.
    device : torch.device
        Device to run training on.

    Returns
    -------
    float
        Average loss for the epoch.

    Notes
    -----
    The loss combines the prediction error with a load balancing term:
    the gate weights returned by the model are forwarded to
    precision_focused_loss for that purpose.
    """
    model.train()
    running = 0.0

    for features, targets in dataloader:
        features = features.to(device)
        targets = targets.to(device)

        predictions, gates = model(features)
        batch_loss = precision_focused_loss(
            predictions, targets, gates, model.load_balance_weight
        )

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        running += batch_loss.item()

    return running / len(dataloader)
359
+
360
+
361
def validate(model: nn.Module, dataloader: DataLoader, device: torch.device) -> float:
    """
    Evaluate the mixture of experts model on a validation set.

    Parameters
    ----------
    model : nn.Module
        The neural network model to validate.
    dataloader : DataLoader
        DataLoader providing validation batches (X, y).
    device : torch.device
        Device to run validation on.

    Returns
    -------
    float
        Average loss over the validation set.
    """
    model.eval()
    running = 0.0

    # Gradients are unnecessary during evaluation.
    with torch.no_grad():
        for features, targets in dataloader:
            features = features.to(device)
            targets = targets.to(device)

            predictions, gates = model(features)
            running += precision_focused_loss(
                predictions, targets, gates, model.load_balance_weight
            ).item()

    return running / len(dataloader)
394
+
395
+
396
@click.command()
@click.option("--epochs", default=200, help="Number of training epochs")
@click.option("--batch-size", default=1024, help="Batch size for training")
@click.option("--lr", default=3e-4, help="Learning rate")
@click.option("--patience", default=20, help="Early stopping patience")
def main(epochs: int, batch_size: int, lr: float, patience: int) -> None:
    """
    Train mixture of experts model for xyY to Munsell conversion.

    Parameters
    ----------
    epochs : int
        Maximum number of training epochs.
    batch_size : int
        Training batch size.
    lr : float
        Learning rate for the AdamW optimizer.
    patience : int
        Early stopping patience (epochs without improvement).

    Notes
    -----
    The training pipeline:
    1. Loads training data from cache
    2. Normalizes outputs to [0, 1] using the hardcoded Munsell ranges
    3. Creates PyTorch DataLoaders
    4. Initializes MixtureOfExperts with 4 expert networks
    5. Trains with AdamW optimizer and precision-focused loss
    6. Uses learning rate scheduler (ReduceLROnPlateau)
    7. Implements early stopping based on validation loss
    8. Exports best model to ONNX format
    9. Logs all metrics and artifacts to MLflow
    """
    LOGGER.info("=" * 80)
    LOGGER.info("Mixture of Experts: xyY → Munsell")
    LOGGER.info("=" * 80)

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    LOGGER.info("Using device: %s", device)

    # Paths
    model_directory = PROJECT_ROOT / "models" / "from_xyY"
    data_dir = PROJECT_ROOT / "data"
    cache_file = data_dir / "training_data.npz"

    # Load training data
    LOGGER.info("")
    LOGGER.info("Loading training data from %s...", cache_file)
    data = np.load(cache_file)
    X_train = data["X_train"]
    y_train = data["y_train"]
    X_val = data["X_val"]
    y_val = data["y_val"]

    LOGGER.info("Train samples: %d", len(X_train))
    LOGGER.info("Validation samples: %d", len(X_val))

    # Normalize outputs (xyY inputs are already in [0, 1] range)
    # Use hardcoded ranges covering the full Munsell space for generalization
    output_params = MUNSELL_NORMALIZATION_PARAMS
    y_train_norm = normalize_munsell(y_train, output_params)
    y_val_norm = normalize_munsell(y_val, output_params)

    # Convert to PyTorch tensors
    X_train_t = torch.FloatTensor(X_train)
    y_train_t = torch.FloatTensor(y_train_norm)
    X_val_t = torch.FloatTensor(X_val)
    y_val_t = torch.FloatTensor(y_val_norm)

    # Create data loaders
    train_dataset = TensorDataset(X_train_t, y_train_t)
    val_dataset = TensorDataset(X_val_t, y_val_t)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model
    model = MixtureOfExperts(num_experts=4, num_residual_blocks=2).to(device)
    LOGGER.info("")
    LOGGER.info("Mixture of Experts architecture:")
    LOGGER.info("%s", model)

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    LOGGER.info("Total parameters: %s", f"{total_params:,}")

    # Training setup
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.5, patience=5
    )

    # MLflow setup
    run_name = setup_mlflow_experiment("from_xyY", "mixture_of_experts")

    LOGGER.info("")
    LOGGER.info("MLflow run: %s", run_name)

    # Training loop
    best_val_loss = float("inf")
    patience_counter = 0

    LOGGER.info("")
    LOGGER.info("Starting training...")

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(
            {
                "model": "mixture_of_experts",
                "learning_rate": lr,
                "batch_size": batch_size,
                "num_epochs": epochs,
                "patience": patience,
                "total_params": total_params,
            }
        )

        for epoch in range(epochs):
            train_loss = train_epoch(model, train_loader, optimizer, device)
            val_loss = validate(model, val_loader, device)

            scheduler.step(val_loss)

            log_training_epoch(
                epoch, train_loss, val_loss, optimizer.param_groups[0]["lr"]
            )

            LOGGER.info(
                "Epoch %03d/%d - Train Loss: %.6f, Val Loss: %.6f, LR: %.6f",
                epoch + 1,
                epochs,
                train_loss,
                val_loss,
                optimizer.param_groups[0]["lr"],
            )

            # Early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0

                # parents=True so a missing "models" parent directory does not
                # abort the very first checkpoint save.
                model_directory.mkdir(parents=True, exist_ok=True)
                checkpoint_file = model_directory / "mixture_of_experts_best.pth"

                torch.save(
                    {
                        "model_state_dict": model.state_dict(),
                        "epoch": epoch,
                        "val_loss": val_loss,
                    },
                    checkpoint_file,
                )

                LOGGER.info(" → Saved best model (val_loss: %.6f)", val_loss)
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    LOGGER.info("")
                    LOGGER.info("Early stopping after %d epochs", epoch + 1)
                    break

        mlflow.log_metrics(
            {
                "best_val_loss": best_val_loss,
                "final_epoch": epoch + 1,
            }
        )

        # Export to ONNX (simplified - outputs only prediction, not gate weights)
        LOGGER.info("")
        LOGGER.info("Exporting to ONNX...")
        model.eval()

        # Reload the best checkpoint before export; map_location keeps the
        # load working even if the checkpoint device differs.
        checkpoint = torch.load(checkpoint_file, map_location=device)
        model.load_state_dict(checkpoint["model_state_dict"])

        # Create wrapper for ONNX export (only return prediction)
        class MoEWrapper(nn.Module):
            """Expose only the blended prediction, dropping gate weights."""

            def __init__(self, moe_model: nn.Module) -> None:
                super().__init__()
                self.moe_model = moe_model

            def forward(self, x: torch.Tensor) -> torch.Tensor:
                output, _ = self.moe_model(x)
                return output

        wrapped_model = MoEWrapper(model).to(device)
        wrapped_model.eval()

        dummy_input = torch.randn(1, 3).to(device)

        onnx_file = model_directory / "mixture_of_experts.onnx"
        torch.onnx.export(
            wrapped_model,
            dummy_input,
            onnx_file,
            export_params=True,
            opset_version=15,
            input_names=["xyY"],
            output_names=["munsell_spec"],
            dynamic_axes={
                "xyY": {0: "batch_size"},
                "munsell_spec": {0: "batch_size"},
            },
        )

        # Save normalization parameters alongside model
        params_file = model_directory / "mixture_of_experts_normalization_params.npz"
        input_params = XYY_NORMALIZATION_PARAMS
        np.savez(
            params_file,
            input_params=input_params,
            output_params=output_params,
        )

        mlflow.log_artifact(str(checkpoint_file))
        mlflow.log_artifact(str(onnx_file))
        mlflow.log_artifact(str(params_file))
        mlflow.pytorch.log_model(model, "model")

        LOGGER.info("ONNX model saved to: %s", onnx_file)
        LOGGER.info("Normalization parameters saved to: %s", params_file)
        LOGGER.info("Artifacts logged to MLflow")

    LOGGER.info("=" * 80)
615
+
616
+
617
+ if __name__ == "__main__":
618
+ logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)
619
+
620
+ main()
learning_munsell/training/from_xyY/train_mlp.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train ML model for xyY to Munsell conversion.
3
+
4
+ This script trains a compact MLP/DNN model with architecture:
5
+ 3 inputs → [64, 128, 128, 64] hidden layers → 4 outputs
6
+
7
+ Target: < 1e-7 accuracy compared to iterative algorithm
8
+ """
9
+
10
+ import logging
11
+
12
+ import click
13
+ import mlflow
14
+ import mlflow.pytorch
15
+ import numpy as np
16
+ import torch
17
+ from torch import optim
18
+ from torch.utils.data import DataLoader, TensorDataset
19
+
20
+ from learning_munsell import PROJECT_ROOT
21
+ from learning_munsell.models.networks import MLPToMunsell
22
+ from learning_munsell.utilities.common import log_training_epoch, setup_mlflow_experiment
23
+ from learning_munsell.utilities.data import (
24
+ MUNSELL_NORMALIZATION_PARAMS,
25
+ XYY_NORMALIZATION_PARAMS,
26
+ normalize_munsell,
27
+ )
28
+ from learning_munsell.utilities.losses import weighted_mse_loss
29
+ from learning_munsell.utilities.training import train_epoch, validate
30
+
31
+ LOGGER = logging.getLogger(__name__)
32
+
33
+
34
@click.command()
@click.option("--epochs", default=200, help="Maximum training epochs.")
@click.option("--batch-size", default=1024, help="Training batch size.")
@click.option("--lr", default=5e-4, help="Learning rate.")
@click.option("--patience", default=20, help="Early stopping patience.")
def main(epochs: int, batch_size: int, lr: float, patience: int) -> None:
    """
    Train the MLPToMunsell model for xyY to Munsell conversion.

    Parameters
    ----------
    epochs : int
        Maximum number of training epochs.
    batch_size : int
        Training batch size.
    lr : float
        Learning rate for the Adam optimizer.
    patience : int
        Early stopping patience (epochs without improvement).

    Notes
    -----
    The training pipeline:
    1. Loads training data from cache
    2. Normalizes Munsell outputs to [0, 1] range
    3. Trains compact MLP model (3 → [64, 128, 128, 64] → 4)
    4. Uses weighted MSE loss function
    5. Learning rate scheduling with ReduceLROnPlateau
    6. Early stopping based on validation loss
    7. Exports model to ONNX format
    8. Logs metrics and artifacts to MLflow
    """
    LOGGER.info("=" * 80)
    LOGGER.info("ML-Based xyY to Munsell Conversion: Model Training")
    LOGGER.info("=" * 80)

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    LOGGER.info("Using device: %s", device)

    # Load training data
    data_dir = PROJECT_ROOT / "data"
    cache_file = data_dir / "training_data.npz"

    if not cache_file.exists():
        LOGGER.error("Error: Training data not found at %s", cache_file)
        LOGGER.error("Please run 01_generate_training_data.py first")
        return

    LOGGER.info("Loading training data from %s...", cache_file)
    data = np.load(cache_file)

    X_train = data["X_train"]
    y_train = data["y_train"]
    X_val = data["X_val"]
    y_val = data["y_val"]

    # Note: Invalid samples (outside Munsell gamut) are also stored in the cache
    # Available as: data['xyY_all'], data['munsell_all'], data['valid_mask']
    # These can be used for future enhancements like:
    # - Adversarial training to avoid extrapolation
    # - Gamut-aware loss functions
    # - Uncertainty estimation at boundaries

    LOGGER.info("Train samples: %d", len(X_train))
    LOGGER.info("Validation samples: %d", len(X_val))

    # Normalize outputs (xyY inputs are already in [0, 1] range)
    # Use hardcoded ranges covering the full Munsell space for generalization
    output_params = MUNSELL_NORMALIZATION_PARAMS
    y_train_norm = normalize_munsell(y_train, output_params)
    y_val_norm = normalize_munsell(y_val, output_params)

    # Convert to PyTorch tensors
    X_train_t = torch.FloatTensor(X_train)
    y_train_t = torch.FloatTensor(y_train_norm)
    X_val_t = torch.FloatTensor(X_val)
    y_val_t = torch.FloatTensor(y_val_norm)

    # Create data loaders
    train_dataset = TensorDataset(X_train_t, y_train_t)
    val_dataset = TensorDataset(X_val_t, y_val_t)

    # Larger batch size for larger dataset (500K samples)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model
    model = MLPToMunsell().to(device)
    LOGGER.info("")
    LOGGER.info("Model architecture:")
    LOGGER.info("%s", model)

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    LOGGER.info("Total parameters: %s", f"{total_params:,}")

    # Training setup - lower learning rate for larger model
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Use weighted MSE with default weights. The tensor is created on the
    # training device so the loss also works when the model runs on CUDA
    # (presumably weighted_mse_loss multiplies weights with the residuals —
    # a CPU tensor there would raise a device mismatch; verify against the
    # helper if behavior differs).
    weights = torch.tensor([1.0, 1.0, 2.0, 0.5], device=device)

    def criterion(pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        """Weighted MSE closure binding the per-channel weights."""
        return weighted_mse_loss(pred, target, weights)

    # MLflow setup
    run_name = setup_mlflow_experiment("from_xyY", "mlp")

    LOGGER.info("")
    LOGGER.info("MLflow run: %s", run_name)

    # Training loop
    best_val_loss = float("inf")
    patience_counter = 0

    LOGGER.info("")
    LOGGER.info("Starting training...")

    with mlflow.start_run(run_name=run_name):
        # Log hyperparameters
        mlflow.log_params(
            {
                "epochs": epochs,
                "batch_size": batch_size,
                "learning_rate": lr,
                "optimizer": "Adam",
                "criterion": "weighted_mse_loss",
                "patience": patience,
                "total_params": total_params,
            }
        )

        for epoch in range(epochs):
            train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
            val_loss = validate(model, val_loader, criterion, device)

            # Log to MLflow
            log_training_epoch(
                epoch, train_loss, val_loss, optimizer.param_groups[0]["lr"]
            )

            LOGGER.info(
                "Epoch %03d/%d - Train Loss: %.6f, Val Loss: %.6f",
                epoch + 1,
                epochs,
                train_loss,
                val_loss,
            )

            # Early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0

                # Save best model; parents=True so a missing "models" parent
                # directory does not abort the very first checkpoint save.
                model_directory = PROJECT_ROOT / "models" / "from_xyY"
                model_directory.mkdir(parents=True, exist_ok=True)
                checkpoint_file = model_directory / "mlp_best.pth"

                torch.save(
                    {
                        "model_state_dict": model.state_dict(),
                        "output_params": output_params,
                        "epoch": epoch,
                        "val_loss": val_loss,
                    },
                    checkpoint_file,
                )

                LOGGER.info(" → Saved best model (val_loss: %.6f)", val_loss)
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    LOGGER.info("")
                    LOGGER.info("Early stopping after %d epochs", epoch + 1)
                    break

        # Log final metrics
        mlflow.log_metrics(
            {
                "best_val_loss": best_val_loss,
                "final_epoch": epoch + 1,
            }
        )

        # Export to ONNX
        LOGGER.info("")
        LOGGER.info("Exporting model to ONNX...")
        model.eval()

        # Load best model; map_location keeps the load working even if the
        # checkpoint device differs.
        checkpoint = torch.load(checkpoint_file, map_location=device)
        model.load_state_dict(checkpoint["model_state_dict"])

        # Create dummy input
        dummy_input = torch.randn(1, 3).to(device)

        # Export
        onnx_file = model_directory / "mlp.onnx"
        torch.onnx.export(
            model,
            dummy_input,
            onnx_file,
            export_params=True,
            opset_version=15,
            input_names=["xyY"],
            output_names=["munsell_spec"],
            dynamic_axes={"xyY": {0: "batch_size"}, "munsell_spec": {0: "batch_size"}},
        )

        # Save normalization parameters alongside model
        params_file = model_directory / "mlp_normalization_params.npz"
        input_params = XYY_NORMALIZATION_PARAMS
        np.savez(
            params_file,
            input_params=input_params,
            output_params=output_params,
        )

        LOGGER.info("ONNX model saved to: %s", onnx_file)
        LOGGER.info("Normalization parameters saved to: %s", params_file)

        # Log artifacts
        mlflow.log_artifact(str(checkpoint_file))
        mlflow.log_artifact(str(onnx_file))
        mlflow.log_artifact(str(params_file))

        # Log model
        mlflow.pytorch.log_model(model, "model")

    LOGGER.info("=" * 80)
264
+
265
+
266
+ if __name__ == "__main__":
267
+ logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)
268
+
269
+ main()
learning_munsell/training/from_xyY/train_mlp_attention.py ADDED
@@ -0,0 +1,460 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train MLP + Self-Attention model for xyY to Munsell conversion.
3
+
4
+ Option 1: MLP backbone with multi-head self-attention layers
5
+ - Input: 3 features (xyY)
6
+ - Architecture: 3 -> 512 -> 1024 + [Attention + ResBlock] x 4 -> 512 -> 4
7
+ - Output: 4 features (hue, value, chroma, code)
8
+ """
9
+
10
+ import logging
11
+ import click
12
+ import mlflow
13
+ import mlflow.pytorch
14
+ import numpy as np
15
+ import torch
16
+ from numpy.typing import NDArray
17
+ from torch import nn, optim
18
+ from torch.utils.data import DataLoader, TensorDataset
19
+
20
+ from learning_munsell import PROJECT_ROOT
21
+ from learning_munsell.models.networks import ResidualBlock
22
+ from learning_munsell.utilities.common import log_training_epoch, setup_mlflow_experiment
23
+ from learning_munsell.utilities.data import (
24
+ MUNSELL_NORMALIZATION_PARAMS,
25
+ XYY_NORMALIZATION_PARAMS,
26
+ normalize_munsell,
27
+ )
28
+ from learning_munsell.utilities.losses import precision_focused_loss
29
+ from learning_munsell.utilities.training import train_epoch, validate
30
+
31
+ LOGGER = logging.getLogger(__name__)
32
+
33
+
34
class MultiHeadSelfAttention(nn.Module):
    """
    Multi-head self-attention over a flat feature vector.

    Splits the incoming ``dim``-sized features into ``num_heads`` groups and
    lets the groups attend to one another via scaled dot-product attention,
    then merges them back through a final output projection.

    Parameters
    ----------
    dim
        Input and output feature dimension.
    num_heads
        Number of attention heads. Must divide ``dim`` evenly.

    Attributes
    ----------
    query
        Linear projection producing query vectors.
    key
        Linear projection producing key vectors.
    value
        Linear projection producing value vectors.
    out
        Output projection applied after attention.
    scale
        Scaling factor (1/sqrt(head_dim)) for the dot products.
    """

    def __init__(self, dim: int, num_heads: int = 4) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.dim = dim
        self.head_dim = dim // num_heads

        assert dim % num_heads == 0, "dim must be divisible by num_heads"  # noqa: S101

        self.query = nn.Linear(dim, dim)
        self.key = nn.Linear(dim, dim)
        self.value = nn.Linear(dim, dim)
        self.out = nn.Linear(dim, dim)

        # 1/sqrt(head_dim) keeps the dot products in a numerically sane range.
        self.scale = self.head_dim**-0.5

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply multi-head self-attention.

        Parameters
        ----------
        x
            Input tensor of shape ``(batch_size, dim)``.

        Returns
        -------
        torch.Tensor
            Output tensor of shape ``(batch_size, dim)`` with attention applied.
        """
        batch = x.size(0)
        split = (batch, self.num_heads, self.head_dim)

        # Project and split the feature vector into per-head chunks.
        q = self.query(x).view(split)
        k = self.key(x).view(split)
        v = self.value(x).view(split)

        # Scaled dot-product attention across the head axis.
        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        attn = scores.softmax(dim=-1)

        # Mix the value vectors, merge heads and project back to dim.
        mixed = torch.matmul(attn, v).reshape(batch, self.dim)
        return self.out(mixed)
109
+
110
+
111
class AttentionResBlock(nn.Module):
    """
    Self-attention stage followed by a residual MLP stage.

    The attention output is added back to its input (skip connection) and
    batch-normalised; the result then passes through a residual block and a
    second batch normalisation.

    Parameters
    ----------
    dim
        Input and output feature dimension.
    num_heads
        Number of attention heads forwarded to the self-attention layer.

    Attributes
    ----------
    attention
        Multi-head self-attention layer.
    norm1
        Batch normalization after attention.
    residual
        Residual MLP block.
    norm2
        Batch normalization after the residual block.
    """

    def __init__(self, dim: int, num_heads: int = 4) -> None:
        super().__init__()
        self.attention = MultiHeadSelfAttention(dim, num_heads)
        self.norm1 = nn.BatchNorm1d(dim)
        self.residual = ResidualBlock(dim)
        self.norm2 = nn.BatchNorm1d(dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply the attention and residual transformations.

        Parameters
        ----------
        x
            Input tensor of shape ``(batch_size, dim)``.

        Returns
        -------
        torch.Tensor
            Output tensor of shape ``(batch_size, dim)``.
        """
        # Attention sub-layer with skip connection, then normalisation.
        attended = self.norm1(self.attention(x) + x)
        # Residual MLP sub-layer, then normalisation.
        return self.norm2(self.residual(attended))
162
+
163
+
164
class MLPAttention(nn.Module):
    """
    MLP with self-attention for xyY to Munsell conversion.

    Architecture:
    - Input: 3 features (xyY normalized to [0, 1])
    - Encoder: 3 -> 512 -> 1024
    - Attention-ResBlocks at 1024-dim (configurable count)
    - Decoder: 1024 -> 512 -> 4
    - Output: 4 features (hue, value, chroma, code normalized)

    Parameters
    ----------
    num_blocks
        Number of attention-residual blocks in the middle.
    num_heads
        Number of attention heads in each attention layer.

    Attributes
    ----------
    encoder
        MLP that lifts the 3D xyY input into a 1024D feature space.
    blocks
        List of AttentionResBlock modules applied at constant width.
    decoder
        MLP that projects the 1024D features down to the 4D Munsell output.
    """

    def __init__(self, num_blocks: int = 4, num_heads: int = 4) -> None:
        super().__init__()

        # 3D xyY -> 1024D feature space.
        self.encoder = nn.Sequential(
            nn.Linear(3, 512),
            nn.GELU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, 1024),
            nn.GELU(),
            nn.BatchNorm1d(1024),
        )

        # Stack of attention + residual stages at constant width.
        self.blocks = nn.ModuleList(
            AttentionResBlock(1024, num_heads) for _ in range(num_blocks)
        )

        # 1024D features -> 4D Munsell specification.
        self.decoder = nn.Sequential(
            nn.Linear(1024, 512),
            nn.GELU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, 4),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Predict a Munsell specification from an xyY input.

        Parameters
        ----------
        x
            Input tensor of shape ``(batch_size, 3)`` containing normalized
            xyY values.

        Returns
        -------
        torch.Tensor
            Output tensor of shape ``(batch_size, 4)`` containing the
            normalized Munsell specification [hue, value, chroma, code].
        """
        hidden = self.encoder(x)

        for stage in self.blocks:
            hidden = stage(hidden)

        return self.decoder(hidden)
243
+
244
+
245
+ @click.command()
246
+ @click.option("--epochs", default=200, help="Number of training epochs")
247
+ @click.option("--batch-size", default=1024, help="Batch size for training")
248
+ @click.option("--lr", default=3e-4, help="Learning rate")
249
+ @click.option("--patience", default=20, help="Early stopping patience")
250
+ def main(epochs: int, batch_size: int, lr: float, patience: int) -> None:
251
+ """
252
+ Train MLP + Self-Attention model for xyY to Munsell conversion.
253
+
254
+ Notes
255
+ -----
256
+ The training pipeline:
257
+ 1. Loads normalization parameters and training data from disk
258
+ 2. Normalizes inputs (xyY) and outputs (Munsell specification) to [0, 1]
259
+ 3. Creates MLPAttention model (4 blocks, 4 attention heads)
260
+ 4. Trains with precision-focused loss (MSE + MAE + log + Huber)
261
+ 5. Uses AdamW optimizer with ReduceLROnPlateau scheduler
262
+ 6. Applies early stopping based on validation loss (patience=20)
263
+ 7. Exports best model to ONNX format
264
+ 8. Logs metrics and artifacts to MLflow
265
+ """
266
+ LOGGER.info("=" * 80)
267
+ LOGGER.info("MLP + Self-Attention: xyY → Munsell")
268
+ LOGGER.info("=" * 80)
269
+
270
+ # Set device
271
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
272
+ LOGGER.info("Using device: %s", device)
273
+
274
+ # Paths
275
+ model_directory = PROJECT_ROOT / "models" / "from_xyY"
276
+ data_dir = PROJECT_ROOT / "data"
277
+ cache_file = data_dir / "training_data.npz"
278
+
279
+ # Load training data
280
+ LOGGER.info("")
281
+ LOGGER.info("Loading training data from %s...", cache_file)
282
+ data = np.load(cache_file)
283
+ X_train = data["X_train"]
284
+ y_train = data["y_train"]
285
+ X_val = data["X_val"]
286
+ y_val = data["y_val"]
287
+
288
+ LOGGER.info("Train samples: %d", len(X_train))
289
+ LOGGER.info("Validation samples: %d", len(X_val))
290
+
291
+ output_params = MUNSELL_NORMALIZATION_PARAMS
292
+ y_train_norm = normalize_munsell(y_train, output_params)
293
+ y_val_norm = normalize_munsell(y_val, output_params)
294
+
295
+ # Convert to PyTorch tensors
296
+ X_train_t = torch.FloatTensor(X_train)
297
+ y_train_t = torch.FloatTensor(y_train_norm)
298
+ X_val_t = torch.FloatTensor(X_val)
299
+ y_val_t = torch.FloatTensor(y_val_norm)
300
+
301
+ # Create data loaders
302
+ train_dataset = TensorDataset(X_train_t, y_train_t)
303
+ val_dataset = TensorDataset(X_val_t, y_val_t)
304
+
305
+ train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
306
+ val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
307
+
308
+ # Initialize model
309
+ model = MLPAttention(num_blocks=4, num_heads=4).to(device)
310
+ LOGGER.info("")
311
+ LOGGER.info("MLP + Attention architecture:")
312
+ LOGGER.info("%s", model)
313
+
314
+ # Count parameters
315
+ total_params = sum(p.numel() for p in model.parameters())
316
+ LOGGER.info("Total parameters: %s", f"{total_params:,}")
317
+
318
+ # Training setup
319
+ optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
320
+ scheduler = optim.lr_scheduler.ReduceLROnPlateau(
321
+ optimizer, mode="min", factor=0.5, patience=5
322
+ )
323
+ criterion = precision_focused_loss
324
+
325
+ # MLflow setup
326
+ run_name = setup_mlflow_experiment("from_xyY", "mlp_attention")
327
+
328
+ LOGGER.info("")
329
+ LOGGER.info("MLflow run: %s", run_name)
330
+
331
+ # Training loop
332
+ best_val_loss = float("inf")
333
+ patience_counter = 0
334
+
335
+ LOGGER.info("")
336
+ LOGGER.info("Starting training...")
337
+
338
+ with mlflow.start_run(run_name=run_name):
339
+ # Log hyperparameters
340
+ mlflow.log_params(
341
+ {
342
+ "num_epochs": epochs,
343
+ "batch_size": batch_size,
344
+ "learning_rate": lr,
345
+ "weight_decay": 1e-5,
346
+ "optimizer": "AdamW",
347
+ "scheduler": "ReduceLROnPlateau",
348
+ "criterion": "precision_focused_loss",
349
+ "patience": patience,
350
+ "total_params": total_params,
351
+ "num_blocks": 4,
352
+ "num_heads": 4,
353
+ }
354
+ )
355
+
356
+ for epoch in range(epochs):
357
+ train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
358
+ val_loss = validate(model, val_loader, criterion, device)
359
+
360
+ scheduler.step(val_loss)
361
+
362
+ log_training_epoch(
363
+ epoch, train_loss, val_loss, optimizer.param_groups[0]["lr"]
364
+ )
365
+
366
+ LOGGER.info(
367
+ "Epoch %03d/%d - Train Loss: %.6f, Val Loss: %.6f, LR: %.6f",
368
+ epoch + 1,
369
+ epochs,
370
+ train_loss,
371
+ val_loss,
372
+ optimizer.param_groups[0]["lr"],
373
+ )
374
+
375
+ # Early stopping
376
+ if val_loss < best_val_loss:
377
+ best_val_loss = val_loss
378
+ patience_counter = 0
379
+
380
+ model_directory.mkdir(exist_ok=True)
381
+ checkpoint_file = model_directory / "mlp_attention_best.pth"
382
+
383
+ torch.save(
384
+ {
385
+ "model_state_dict": model.state_dict(),
386
+ "epoch": epoch,
387
+ "val_loss": val_loss,
388
+ },
389
+ checkpoint_file,
390
+ )
391
+
392
+ LOGGER.info(" → Saved best model (val_loss: %.6f)", val_loss)
393
+ else:
394
+ patience_counter += 1
395
+ if patience_counter >= patience:
396
+ LOGGER.info("")
397
+ LOGGER.info("Early stopping after %d epochs", epoch + 1)
398
+ break
399
+
400
+ # Log final metrics
401
+ mlflow.log_metrics(
402
+ {
403
+ "best_val_loss": best_val_loss,
404
+ "final_epoch": epoch + 1,
405
+ }
406
+ )
407
+
408
+ # Export to ONNX
409
+ LOGGER.info("")
410
+ LOGGER.info("Exporting to ONNX...")
411
+ model.eval()
412
+
413
+ checkpoint = torch.load(checkpoint_file)
414
+ model.load_state_dict(checkpoint["model_state_dict"])
415
+
416
+ dummy_input = torch.randn(1, 3).to(device)
417
+
418
+ onnx_file = model_directory / "mlp_attention.onnx"
419
+ torch.onnx.export(
420
+ model,
421
+ dummy_input,
422
+ onnx_file,
423
+ export_params=True,
424
+ opset_version=15,
425
+ input_names=["xyY"],
426
+ output_names=["munsell_spec"],
427
+ dynamic_axes={
428
+ "xyY": {0: "batch_size"},
429
+ "munsell_spec": {0: "batch_size"},
430
+ },
431
+ )
432
+
433
+ # Save normalization parameters alongside model
434
+ params_file = model_directory / "mlp_attention_normalization_params.npz"
435
+ input_params = XYY_NORMALIZATION_PARAMS
436
+ np.savez(
437
+ params_file,
438
+ input_params=input_params,
439
+ output_params=output_params,
440
+ )
441
+
442
+ LOGGER.info("ONNX model saved to: %s", onnx_file)
443
+ LOGGER.info("Normalization parameters saved to: %s", params_file)
444
+
445
+ # Log artifacts
446
+ mlflow.log_artifact(str(checkpoint_file))
447
+ mlflow.log_artifact(str(onnx_file))
448
+ mlflow.log_artifact(str(params_file))
449
+
450
+ # Log model
451
+ mlflow.pytorch.log_model(model, "model")
452
+
453
+
454
+ LOGGER.info("=" * 80)
455
+
456
+
457
if __name__ == "__main__":
    # Configure root logging for CLI usage; ``force=True`` replaces any
    # handlers previously installed by imported libraries (e.g. mlflow).
    logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)

    main()
learning_munsell/training/from_xyY/train_mlp_error_predictor.py ADDED
@@ -0,0 +1,457 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train error predictor with advanced MLP architecture.
3
+
4
+ Architecture features:
5
+ - Larger capacity: 7 → 256 → 512 → 512 → 256 → 4
6
+ - Residual connections (MLP-style) for better gradient flow
7
+ - Modern activation functions (GELU instead of ReLU)
8
+ - Precision-focused loss function
9
+
10
+ Generic error predictor that can work with any base model.
11
+ """
12
+
13
+ import logging
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+ import click
18
+ import mlflow
19
+ import mlflow.pytorch
20
+ import numpy as np
21
+ import onnxruntime as ort
22
+ import torch
23
+ from numpy.typing import NDArray
24
+ from torch import nn, optim
25
+ from torch.utils.data import DataLoader, TensorDataset
26
+
27
+ from learning_munsell import PROJECT_ROOT
28
+ from learning_munsell.models.networks import ResidualBlock
29
+ from learning_munsell.utilities.common import log_training_epoch, setup_mlflow_experiment
30
+ from learning_munsell.utilities.data import normalize_munsell, normalize_xyY
31
+ from learning_munsell.utilities.losses import precision_focused_loss
32
+ from learning_munsell.utilities.training import train_epoch, validate
33
+
34
+ LOGGER = logging.getLogger(__name__)
35
+
36
+ # Note: This script has a custom ErrorPredictorMLP architecture
37
+ # so we don't import ComponentErrorPredictor/MultiHeadErrorPredictor from shared modules.
38
+
39
+
40
class ErrorPredictorMLP(nn.Module):
    """
    Advanced error predictor with residual connections.

    Second stage of a two-stage Munsell prediction pipeline: a base model
    first predicts a Munsell specification from xyY coordinates, and this
    network then learns the residual correction to add to that prediction.
    Its input is the concatenation of the normalized xyY coordinates and the
    base model's normalized prediction (7 features total); its output is the
    4-component correction.

    Architecture:
    - Encoder: 7 → 256 → 512 (Linear → GELU → BatchNorm per stage)
    - ``num_residual_blocks`` residual blocks at 512 dimensions
    - Decoder: 512 → 256 → 128 → 4

    Parameters
    ----------
    num_residual_blocks : int, optional
        Number of residual blocks applied between encoder and decoder.
        Default is 3.

    Attributes
    ----------
    encoder : nn.Sequential
        Maps the 7-D input to a 512-D representation.
    residual_blocks : nn.ModuleList
        Residual blocks for deep feature extraction.
    decoder : nn.Sequential
        Maps the 512-D representation to the 4-D error prediction.
    """

    def __init__(self, num_residual_blocks: int = 3) -> None:
        super().__init__()

        # Module names and layer ordering are kept identical to the original
        # layout so existing checkpoint state_dict keys remain compatible.
        encoder_layers = [
            nn.Linear(7, 256),
            nn.GELU(),
            nn.BatchNorm1d(256),
            nn.Linear(256, 512),
            nn.GELU(),
            nn.BatchNorm1d(512),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        self.residual_blocks = nn.ModuleList(
            ResidualBlock(512) for _ in range(num_residual_blocks)
        )

        decoder_layers = [
            nn.Linear(512, 256),
            nn.GELU(),
            nn.BatchNorm1d(256),
            nn.Linear(256, 128),
            nn.GELU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 4),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass through the error predictor.

        Parameters
        ----------
        x : Tensor
            Combined input [xyY_norm, base_pred_norm] of shape (batch_size, 7).

        Returns
        -------
        Tensor
            Predicted error correction of shape (batch_size, 4).
        """
        hidden = self.encoder(x)

        for residual_block in self.residual_blocks:
            hidden = residual_block(hidden)

        return self.decoder(hidden)
128
+
129
+
130
def load_base_model(
    model_path: Path, params_path: Path
) -> tuple[ort.InferenceSession, dict, dict]:
    """
    Load the base ONNX model and its normalization parameters.

    The base model is the first stage of the two-stage architecture: it maps
    xyY coordinates to an initial Munsell specification prediction that the
    error predictor subsequently refines.

    Parameters
    ----------
    model_path : Path
        Path to the ONNX model file.
    params_path : Path
        Path to the .npz file containing input and output normalization
        parameters.

    Returns
    -------
    session : ort.InferenceSession
        ONNX Runtime inference session for the base model.
    input_params : dict
        Input normalization ranges (x_range, y_range, Y_range).
    output_params : dict
        Output normalization ranges (hue_range, value_range, chroma_range,
        code_range).
    """
    inference_session = ort.InferenceSession(str(model_path))

    # ``allow_pickle`` is required because the params were saved as dict
    # objects inside the .npz archive.
    archive = np.load(params_path, allow_pickle=True)
    input_params = archive["input_params"].item()
    output_params = archive["output_params"].item()

    return inference_session, input_params, output_params
159
+
160
+
161
@click.command()
@click.option(
    "--base-model",
    type=click.Path(exists=True, path_type=Path),
    help="Path to base model ONNX file",
)
@click.option(
    "--params",
    type=click.Path(exists=True, path_type=Path),
    help="Path to normalization params file",
)
@click.option(
    "--epochs",
    type=int,
    default=200,
    help="Number of training epochs",
)
@click.option(
    "--batch-size",
    type=int,
    default=1024,
    help="Batch size for training",
)
@click.option(
    "--lr",
    type=float,
    default=3e-4,
    help="Learning rate",
)
@click.option(
    "--patience",
    type=int,
    default=20,
    help="Patience for early stopping",
)
def main(
    base_model: Path | None,
    params: Path | None,
    epochs: int,
    batch_size: int,
    lr: float,
    patience: int,
) -> None:
    """
    Train error predictor with advanced MLP architecture.

    Parameters
    ----------
    base_model : Path or None
        Path to the base model ONNX file. If None, uses the default
        ``xyY_to_munsell_specification.onnx`` in the models directory.
    params : Path or None
        Path to normalization parameters .npz file. If None, uses the
        default parameters file matching the default base model.
    epochs : int
        Number of training epochs.
    batch_size : int
        Batch size for training.
    lr : float
        Learning rate for the AdamW optimizer.
    patience : int
        Epochs without validation improvement before early stopping.

    Notes
    -----
    The training pipeline:
    1. Loads pre-trained base model
    2. Generates base model predictions for training data
    3. Computes residual errors between predictions and targets
    4. Trains error predictor on these residuals
    5. Uses precision-focused loss function
    6. Learning rate scheduling with ReduceLROnPlateau
    7. Early stopping based on validation loss
    8. Exports model to ONNX format
    9. Logs metrics and artifacts to MLflow
    """
    LOGGER.info("=" * 80)
    LOGGER.info("Error Predictor: MLP + GELU + Precision Loss")
    LOGGER.info("=" * 80)

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    LOGGER.info("Using device: %s", device)

    # Paths
    model_directory = PROJECT_ROOT / "models" / "from_xyY"
    data_dir = PROJECT_ROOT / "data"

    # Fall back to the default exported base model when the CLI options are
    # omitted (previously a missing --base-model crashed in load_base_model
    # with ``None``, despite the docstring promising a default path).
    # NOTE(review): default filenames assumed to match the base training
    # script's export names — confirm against models/from_xyY contents.
    base_model_path = base_model or (
        model_directory / "xyY_to_munsell_specification.onnx"
    )
    params_path = params or (
        model_directory / "xyY_to_munsell_specification_normalization_params.npz"
    )
    cache_file = data_dir / "training_data.npz"

    # Extract base model name for error predictor naming
    base_model_name = base_model_path.stem

    # Load base model
    LOGGER.info("")
    LOGGER.info("Loading base model from %s...", base_model_path)
    base_session, input_params, output_params = load_base_model(
        base_model_path, params_path
    )

    # Load training data
    LOGGER.info("Loading training data from %s...", cache_file)
    data = np.load(cache_file)
    X_train = data["X_train"]
    y_train = data["y_train"]
    X_val = data["X_val"]
    y_val = data["y_val"]

    LOGGER.info("Train samples: %d", len(X_train))
    LOGGER.info("Validation samples: %d", len(X_val))

    # Generate base model predictions
    LOGGER.info("")
    LOGGER.info("Generating base model predictions...")
    X_train_norm = normalize_xyY(X_train, input_params)
    y_train_norm = normalize_munsell(y_train, output_params)

    # Base predictions (normalized)
    base_pred_train_norm = base_session.run(None, {"xyY": X_train_norm})[0]

    X_val_norm = normalize_xyY(X_val, input_params)
    y_val_norm = normalize_munsell(y_val, output_params)
    base_pred_val_norm = base_session.run(None, {"xyY": X_val_norm})[0]

    # Compute errors (in normalized space); these residuals are the
    # regression targets for the error predictor.
    error_train = y_train_norm - base_pred_train_norm
    error_val = y_val_norm - base_pred_val_norm

    # Statistics
    LOGGER.info("")
    LOGGER.info("Base model error statistics (normalized space):")
    LOGGER.info("  Mean absolute error: %.6f", np.mean(np.abs(error_train)))
    LOGGER.info("  Std of error: %.6f", np.std(error_train))
    LOGGER.info("  Max absolute error: %.6f", np.max(np.abs(error_train)))

    # Create combined input: [xyY_norm, base_prediction_norm]
    X_train_combined = np.concatenate([X_train_norm, base_pred_train_norm], axis=1)
    X_val_combined = np.concatenate([X_val_norm, base_pred_val_norm], axis=1)

    # Convert to PyTorch tensors
    X_train_t = torch.FloatTensor(X_train_combined)
    error_train_t = torch.FloatTensor(error_train)
    X_val_t = torch.FloatTensor(X_val_combined)
    error_val_t = torch.FloatTensor(error_val)

    # Create data loaders
    train_dataset = TensorDataset(X_train_t, error_train_t)
    val_dataset = TensorDataset(X_val_t, error_val_t)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize error predictor model with MLP architecture
    model = ErrorPredictorMLP(num_residual_blocks=3).to(device)
    LOGGER.info("")
    LOGGER.info("Error predictor architecture:")
    LOGGER.info("%s", model)

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    LOGGER.info("Total parameters: %s", f"{total_params:,}")

    # Training setup with precision-focused loss
    LOGGER.info("")
    LOGGER.info("Using precision-focused loss function:")
    LOGGER.info("  - MSE (weight: 1.0)")
    LOGGER.info("  - MAE (weight: 0.5)")
    LOGGER.info("  - Log penalty for small errors (weight: 0.3)")
    LOGGER.info("  - Huber loss with delta=0.01 (weight: 0.5)")

    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.5, patience=5
    )
    criterion = precision_focused_loss

    # MLflow setup
    model_name = f"{base_model_name}_error_predictor"
    run_name = setup_mlflow_experiment("from_xyY", model_name)

    LOGGER.info("")
    LOGGER.info("MLflow run: %s", run_name)

    # Checkpoint destination; created up front so the post-training export
    # path is well-defined even for degenerate epoch counts.
    model_directory.mkdir(exist_ok=True)
    checkpoint_file = model_directory / f"{base_model_name}_error_predictor_best.pth"

    # Training loop
    best_val_loss = float("inf")
    patience_counter = 0

    LOGGER.info("")
    LOGGER.info("Starting training...")

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(
            {
                "model": model_name,
                "base_model": base_model_name,
                "learning_rate": lr,
                "batch_size": batch_size,
                "num_epochs": epochs,
                "patience": patience,
                "total_params": total_params,
            }
        )

        for epoch in range(epochs):
            train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
            val_loss = validate(model, val_loader, criterion, device)

            # Update learning rate
            scheduler.step(val_loss)

            log_training_epoch(
                epoch, train_loss, val_loss, optimizer.param_groups[0]["lr"]
            )

            LOGGER.info(
                "Epoch %03d/%d - Train Loss: %.6f, Val Loss: %.6f, LR: %.6f",
                epoch + 1,
                epochs,
                train_loss,
                val_loss,
                optimizer.param_groups[0]["lr"],
            )

            # Early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0

                # Save best model
                torch.save(
                    {
                        "model_state_dict": model.state_dict(),
                        "epoch": epoch,
                        "val_loss": val_loss,
                    },
                    checkpoint_file,
                )

                LOGGER.info("  → Saved best model (val_loss: %.6f)", val_loss)
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    LOGGER.info("")
                    LOGGER.info("Early stopping after %d epochs", epoch + 1)
                    break

        mlflow.log_metrics(
            {
                "best_val_loss": best_val_loss,
                "final_epoch": epoch + 1,
            }
        )

        # Export to ONNX
        LOGGER.info("")
        LOGGER.info("Exporting error predictor to ONNX...")
        model.eval()

        # Load best model
        checkpoint = torch.load(checkpoint_file)
        model.load_state_dict(checkpoint["model_state_dict"])

        # Create dummy input (xyY_norm + base_pred_norm = 7 inputs)
        dummy_input = torch.randn(1, 7).to(device)

        # Export
        onnx_file = model_directory / f"{base_model_name}_error_predictor.onnx"
        torch.onnx.export(
            model,
            dummy_input,
            onnx_file,
            export_params=True,
            opset_version=15,
            input_names=["combined_input"],
            output_names=["error_correction"],
            dynamic_axes={
                "combined_input": {0: "batch_size"},
                "error_correction": {0: "batch_size"},
            },
        )

        mlflow.log_artifact(str(checkpoint_file))
        mlflow.log_artifact(str(onnx_file))
        mlflow.pytorch.log_model(model, "model")

        LOGGER.info("Error predictor ONNX model saved to: %s", onnx_file)
        LOGGER.info("Artifacts logged to MLflow")

    LOGGER.info("=" * 80)
452
+
453
+
454
if __name__ == "__main__":
    # Configure root logging for CLI usage; ``force=True`` replaces any
    # handlers previously installed by imported libraries (e.g. mlflow).
    logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)

    main()
learning_munsell/training/from_xyY/train_mlp_gamma.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train ML model for xyY to Munsell conversion with gamma-corrected Y.
3
+
4
+ Experiment: Apply gamma 2.33 to Y before normalization to better align
5
+ with perceptual lightness (Munsell Value scale is perceptually uniform).
6
+ """
7
+
8
+ import logging
9
+ from typing import Any
10
+
11
+ import click
12
+ import mlflow
13
+ import mlflow.pytorch
14
+ import numpy as np
15
+ import torch
16
+ from numpy.typing import NDArray
17
+ from torch import optim
18
+ from torch.utils.data import DataLoader, TensorDataset
19
+
20
+ from learning_munsell import PROJECT_ROOT
21
+ from learning_munsell.models.networks import MLPToMunsell
22
+ from learning_munsell.utilities.common import log_training_epoch, setup_mlflow_experiment
23
+ from learning_munsell.utilities.data import (
24
+ MUNSELL_NORMALIZATION_PARAMS,
25
+ normalize_munsell,
26
+ )
27
+ from learning_munsell.utilities.losses import weighted_mse_loss
28
+ from learning_munsell.utilities.training import train_epoch, validate
29
+
30
+ LOGGER = logging.getLogger(__name__)
31
+
32
# Gamma value for Y transformation
GAMMA = 2.33


def normalize_inputs(
    X: NDArray, gamma: float = GAMMA
) -> tuple[NDArray, dict[str, Any]]:
    """
    Normalize xyY inputs to [0, 1] range with gamma correction on Y.

    The chromaticity coordinates x and y are min-max scaled, while Y is
    scaled, clipped to [0, 1] and raised to ``1 / gamma``, spreading dark
    values and compressing light ones to better match the perceptually
    uniform Munsell Value scale.

    Parameters
    ----------
    X : ndarray
        xyY values of shape (n, 3) where columns are [x, y, Y].
    gamma : float
        Gamma value to apply to Y component.

    Returns
    -------
    ndarray
        Normalized values with gamma-corrected Y, dtype float32.
    dict
        Normalization parameters including gamma value.
    """
    # Typical ranges for xyY
    x_range = (0.0, 1.0)
    y_range = (0.0, 1.0)
    Y_range = (0.0, 1.0)

    # Cast to float32 up front so the returned array matches the documented
    # dtype regardless of the input dtype (previously a float64 input
    # produced a float64 output despite the docstring's float32 promise).
    X = np.asarray(X, dtype=np.float32)

    X_norm = X.copy()
    X_norm[:, 0] = (X[:, 0] - x_range[0]) / (x_range[1] - x_range[0])
    X_norm[:, 1] = (X[:, 1] - y_range[0]) / (y_range[1] - y_range[0])

    # Normalize Y first, then apply gamma
    Y_normalized = (X[:, 2] - Y_range[0]) / (Y_range[1] - Y_range[0])
    # Clip to avoid numerical issues with negative values
    Y_normalized = np.clip(Y_normalized, 0, 1)
    # Apply gamma: Y_gamma = Y^(1/gamma) - this spreads dark values, compresses light
    X_norm[:, 2] = np.power(Y_normalized, 1.0 / gamma)

    params = {
        "x_range": x_range,
        "y_range": y_range,
        "Y_range": Y_range,
        "gamma": gamma,
    }

    return X_norm, params
80
+
81
+
82
@click.command()
@click.option("--epochs", default=200, help="Number of training epochs")
@click.option("--batch-size", default=1024, help="Batch size for training")
@click.option("--lr", default=5e-4, help="Learning rate")
@click.option("--patience", default=20, help="Early stopping patience")
def main(epochs: int, batch_size: int, lr: float, patience: int) -> None:
    """
    Train MLP model with gamma-corrected Y input.

    Parameters
    ----------
    epochs : int
        Number of training epochs.
    batch_size : int
        Batch size for training.
    lr : float
        Learning rate for the Adam optimizer.
    patience : int
        Epochs without validation improvement before early stopping.

    Notes
    -----
    The training pipeline:
    1. Loads training and validation data from cache
    2. Normalizes inputs with gamma correction (gamma=2.33) on Y
    3. Normalizes Munsell outputs to [0, 1] range
    4. Trains MLP with weighted MSE loss
    5. Uses early stopping based on validation loss
    6. Exports best model to ONNX format
    7. Logs metrics and artifacts to MLflow

    The gamma correction on Y aligns with perceptual lightness. The gamma
    transformation spreads dark values and compresses light values, matching
    human lightness perception and the perceptually uniform Munsell Value scale.
    """
    LOGGER.info("=" * 80)
    LOGGER.info("ML-Based xyY to Munsell Conversion: Gamma Experiment")
    LOGGER.info("Gamma = %.2f applied to Y component", GAMMA)
    LOGGER.info("=" * 80)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    LOGGER.info("Using device: %s", device)

    # Load training data
    data_dir = PROJECT_ROOT / "data"
    cache_file = data_dir / "training_data.npz"

    if not cache_file.exists():
        LOGGER.error("Error: Training data not found at %s", cache_file)
        # Previously referenced a nonexistent "01_generate_training_data.py".
        LOGGER.error("Please run generate_training_data.py first")
        return

    LOGGER.info("Loading training data from %s...", cache_file)
    data = np.load(cache_file)

    X_train = data["X_train"]
    y_train = data["y_train"]
    X_val = data["X_val"]
    y_val = data["y_val"]

    LOGGER.info("Train samples: %d", len(X_train))
    LOGGER.info("Validation samples: %d", len(X_val))

    # Normalize data with gamma correction
    X_train_norm, input_params = normalize_inputs(X_train, gamma=GAMMA)
    X_val_norm, _ = normalize_inputs(X_val, gamma=GAMMA)

    output_params = MUNSELL_NORMALIZATION_PARAMS
    y_train_norm = normalize_munsell(y_train, output_params)
    y_val_norm = normalize_munsell(y_val, output_params)

    LOGGER.info("")
    LOGGER.info("Input normalization with gamma=%.2f:", GAMMA)
    LOGGER.info(
        "  Y range after gamma: [%.4f, %.4f]",
        X_train_norm[:, 2].min(),
        X_train_norm[:, 2].max(),
    )

    # Convert to PyTorch tensors
    X_train_t = torch.FloatTensor(X_train_norm)
    y_train_t = torch.FloatTensor(y_train_norm)
    X_val_t = torch.FloatTensor(X_val_norm)
    y_val_t = torch.FloatTensor(y_val_norm)

    # Create data loaders
    train_dataset = TensorDataset(X_train_t, y_train_t)
    val_dataset = TensorDataset(X_val_t, y_val_t)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model
    model = MLPToMunsell().to(device)
    LOGGER.info("")
    LOGGER.info("Model architecture:")
    LOGGER.info("%s", model)

    total_params = sum(p.numel() for p in model.parameters())
    LOGGER.info("Total parameters: %s", f"{total_params:,}")

    # Training setup
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Component weights: emphasize chroma (2.0), de-emphasize code (0.5).
    # Created on the training device so weighted_mse_loss does not mix CPU
    # and GPU tensors when CUDA is available (previously a CPU-only tensor,
    # which raises a device-mismatch error on CUDA).
    weights = torch.tensor([1.0, 1.0, 2.0, 0.5], device=device)

    def criterion(pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        """Per-component weighted MSE between prediction and target."""
        return weighted_mse_loss(pred, target, weights)

    # MLflow setup
    run_name = setup_mlflow_experiment("from_xyY", f"mlp_gamma_{GAMMA}")

    LOGGER.info("")
    LOGGER.info("MLflow run: %s", run_name)

    # Checkpoint destination; created up front so the post-training export
    # path is well-defined even for degenerate epoch counts.
    model_directory = PROJECT_ROOT / "models" / "from_xyY"
    model_directory.mkdir(exist_ok=True)
    checkpoint_file = model_directory / "mlp_gamma_best.pth"

    # Training loop
    best_val_loss = float("inf")
    patience_counter = 0

    LOGGER.info("")
    LOGGER.info("Starting training...")

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(
            {
                "num_epochs": epochs,
                "batch_size": batch_size,
                "learning_rate": lr,
                "optimizer": "Adam",
                "criterion": "weighted_mse_loss",
                "patience": patience,
                "total_params": total_params,
                "gamma": GAMMA,
            }
        )

        for epoch in range(epochs):
            train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
            val_loss = validate(model, val_loader, criterion, device)

            log_training_epoch(
                epoch, train_loss, val_loss, optimizer.param_groups[0]["lr"]
            )

            LOGGER.info(
                "Epoch %03d/%d - Train Loss: %.6f, Val Loss: %.6f",
                epoch + 1,
                epochs,
                train_loss,
                val_loss,
            )

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0

                torch.save(
                    {
                        "model_state_dict": model.state_dict(),
                        "input_params": input_params,
                        "output_params": output_params,
                        "epoch": epoch,
                        "val_loss": val_loss,
                    },
                    checkpoint_file,
                )

                LOGGER.info("  → Saved best model (val_loss: %.6f)", val_loss)
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    LOGGER.info("")
                    LOGGER.info("Early stopping after %d epochs", epoch + 1)
                    break

        mlflow.log_metrics(
            {
                "best_val_loss": best_val_loss,
                "final_epoch": epoch + 1,
            }
        )

        # Export to ONNX
        LOGGER.info("")
        LOGGER.info("Exporting model to ONNX...")
        model.eval()

        checkpoint = torch.load(checkpoint_file)
        model.load_state_dict(checkpoint["model_state_dict"])

        dummy_input = torch.randn(1, 3).to(device)

        onnx_file = model_directory / "mlp_gamma.onnx"
        torch.onnx.export(
            model,
            dummy_input,
            onnx_file,
            export_params=True,
            opset_version=15,
            input_names=["xyY_gamma"],
            output_names=["munsell_spec"],
            dynamic_axes={"xyY_gamma": {0: "batch_size"}, "munsell_spec": {0: "batch_size"}},
        )

        # Save normalization parameters (including gamma)
        params_file = model_directory / "mlp_gamma_normalization_params.npz"
        np.savez(
            params_file,
            input_params=input_params,
            output_params=output_params,
        )

        LOGGER.info("ONNX model saved to: %s", onnx_file)
        LOGGER.info("Normalization parameters saved to: %s", params_file)
        LOGGER.info("IMPORTANT: Input Y must be gamma-corrected with gamma=%.2f", GAMMA)

        mlflow.log_artifact(str(checkpoint_file))
        mlflow.log_artifact(str(onnx_file))
        mlflow.log_artifact(str(params_file))
        mlflow.pytorch.log_model(model, "model")

    LOGGER.info("=" * 80)
292
+
293
+
294
if __name__ == "__main__":
    # Configure root logging for CLI usage; ``force=True`` replaces any
    # handlers previously installed by imported libraries (e.g. mlflow).
    logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)

    main()
learning_munsell/training/from_xyY/train_multi_head_3stage_error_predictor.py ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train second-stage error predictor for 3-stage model.
3
+
4
+ Architecture: Multi-Head + Multi-Error Predictor + Multi-Error Predictor
5
+ - Stage 1: Multi-Head base model (existing)
6
+ - Stage 2: First error predictor (existing)
7
+ - Stage 3: Second error predictor (this script) - learns residuals from stage 2
8
+
9
+ The second error predictor has the same architecture as the first but learns
10
+ the remaining errors after the first error correction is applied.
11
+ """
12
+
13
+ import logging
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+ import click
18
+ import mlflow
19
+ import mlflow.pytorch
20
+ import numpy as np
21
+ import onnxruntime as ort
22
+ import torch
23
+ from numpy.typing import NDArray
24
+ from torch import nn, optim
25
+ from torch.utils.data import DataLoader, TensorDataset
26
+
27
+ from learning_munsell import PROJECT_ROOT
28
+ from learning_munsell.models.networks import (
29
+ ComponentErrorPredictor,
30
+ MultiHeadErrorPredictorToMunsell,
31
+ )
32
+ from learning_munsell.utilities.common import log_training_epoch, setup_mlflow_experiment
33
+ from learning_munsell.utilities.data import normalize_munsell, normalize_xyY
34
+ from learning_munsell.utilities.losses import precision_focused_loss
35
+ from learning_munsell.utilities.training import train_epoch, validate
36
+
37
+ LOGGER = logging.getLogger(__name__)
38
+
39
+
40
@click.command()
@click.option(
    "--base-model",
    type=click.Path(exists=True, path_type=Path),
    default=None,
    help="Path to Multi-Head base model ONNX file",
)
@click.option(
    "--first-error-predictor",
    type=click.Path(exists=True, path_type=Path),
    default=None,
    help="Path to first error predictor ONNX file",
)
@click.option(
    "--params",
    type=click.Path(exists=True, path_type=Path),
    default=None,
    help="Path to normalization params file",
)
@click.option(
    "--epochs",
    type=int,
    default=300,
    help="Number of training epochs (default: 300)",
)
@click.option(
    "--batch-size",
    type=int,
    default=2048,
    help="Batch size for training (default: 2048)",
)
@click.option(
    "--lr",
    type=float,
    default=3e-4,
    help="Learning rate (default: 3e-4)",
)
@click.option(
    "--patience",
    type=int,
    default=30,
    help="Early stopping patience (default: 30)",
)
def main(
    base_model: Path | None,
    first_error_predictor: Path | None,
    params: Path | None,
    epochs: int,
    batch_size: int,
    lr: float,
    patience: int,
) -> None:
    """
    Train the second-stage error predictor for the 3-stage model.

    This script trains the third stage of a 3-stage model:
    - Stage 1: Multi-Head base model (pre-trained)
    - Stage 2: First error predictor (pre-trained)
    - Stage 3: Second error predictor (trained by this script)

    The second error predictor learns the residual errors remaining after
    the first error correction is applied, further refining the predictions.

    Parameters
    ----------
    base_model : Path, optional
        Path to the Multi-Head base model ONNX file.
        Default: models/from_xyY/multi_head_large.onnx
    first_error_predictor : Path, optional
        Path to the first error predictor ONNX file.
        Default: models/from_xyY/multi_head_multi_error_predictor_large.onnx
    params : Path, optional
        Path to the normalization parameters file.
        Default: models/from_xyY/multi_head_large_normalization_params.npz
    epochs : int, optional
        Maximum number of training epochs.
    batch_size : int, optional
        Training batch size.
    lr : float, optional
        Learning rate for the AdamW optimizer.
    patience : int, optional
        Early stopping patience in epochs.

    Notes
    -----
    The training pipeline:
    1. Loads pre-trained Stage 1 and Stage 2 models
    2. Generates Stage 2 predictions (base + first error correction)
    3. Computes remaining residual errors
    4. Trains Stage 3 error predictor on these residuals
    5. Exports the model to ONNX format
    6. Logs metrics and artifacts to MLflow
    """

    LOGGER.info("=" * 80)
    LOGGER.info("Second Error Predictor: 3-Stage Model Training")
    LOGGER.info("Multi-Head + Multi-Error Predictor + Multi-Error Predictor")
    LOGGER.info("=" * 80)

    # NOTE(review): MPS takes precedence over CUDA when both report as
    # available — confirm this ordering is intentional (Apple Silicon bias).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if torch.backends.mps.is_available():
        device = torch.device("mps")
    LOGGER.info("Using device: %s", device)

    model_directory = PROJECT_ROOT / "models" / "from_xyY"
    data_dir = PROJECT_ROOT / "data"

    # Resolve default artifact locations when not provided on the CLI.
    if base_model is None:
        base_model = model_directory / "multi_head_large.onnx"
    if first_error_predictor is None:
        first_error_predictor = model_directory / "multi_head_multi_error_predictor_large.onnx"
    if params is None:
        params = model_directory / "multi_head_large_normalization_params.npz"

    cache_file = data_dir / "training_data_large.npz"

    # Bail out early (with an error log, not an exception) when any required
    # pre-trained artifact or the training data cache is missing.
    if not cache_file.exists():
        LOGGER.error("Error: Large training data not found at %s", cache_file)
        return

    if not base_model.exists():
        LOGGER.error("Error: Base model not found at %s", base_model)
        return

    if not first_error_predictor.exists():
        LOGGER.error("Error: First error predictor not found at %s", first_error_predictor)
        return

    # Load models: the two earlier stages are frozen and run via ONNX Runtime.
    LOGGER.info("")
    LOGGER.info("Loading Stage 1: Multi-Head base model from %s...", base_model)
    base_session = ort.InferenceSession(str(base_model))

    LOGGER.info("Loading Stage 2: First error predictor from %s...", first_error_predictor)
    error_predictor_session = ort.InferenceSession(str(first_error_predictor))

    # Load normalization params (stored as pickled dicts inside the .npz,
    # hence allow_pickle=True and .item() to unwrap the 0-d object arrays).
    params_data = np.load(params, allow_pickle=True)
    input_params = params_data["input_params"].item()
    output_params = params_data["output_params"].item()

    # Load training data
    LOGGER.info("Loading large training data from %s...", cache_file)
    data = np.load(cache_file)
    X_train = data["X_train"]
    y_train = data["y_train"]
    X_val = data["X_val"]
    y_val = data["y_val"]

    LOGGER.info("Train samples: %d", len(X_train))
    LOGGER.info("Validation samples: %d", len(X_val))

    # Generate stage 2 predictions (base + first error correction)
    LOGGER.info("")
    LOGGER.info("Computing Stage 2 predictions (base + first error correction)...")

    X_train_norm = normalize_xyY(X_train, input_params)
    y_train_norm = normalize_munsell(y_train, output_params)
    X_val_norm = normalize_xyY(X_val, input_params)
    y_val_norm = normalize_munsell(y_val, output_params)

    # Chunked ONNX inference to bound memory on the large dataset.
    inference_batch_size = 50000

    # Stage 1: Base model predictions
    LOGGER.info(" Stage 1: Base model predictions (training set)...")
    base_pred_train = []
    for i in range(0, len(X_train_norm), inference_batch_size):
        batch = X_train_norm[i : i + inference_batch_size]
        pred = base_session.run(None, {"xyY": batch})[0]
        base_pred_train.append(pred)
    base_pred_train = np.concatenate(base_pred_train, axis=0)

    LOGGER.info(" Stage 1: Base model predictions (validation set)...")
    base_pred_val = []
    for i in range(0, len(X_val_norm), inference_batch_size):
        batch = X_val_norm[i : i + inference_batch_size]
        pred = base_session.run(None, {"xyY": batch})[0]
        base_pred_val.append(pred)
    base_pred_val = np.concatenate(base_pred_val, axis=0)

    # Stage 2: First error predictor corrections.  Its input is the
    # concatenation [normalized xyY, Stage-1 prediction].
    LOGGER.info(" Stage 2: First error predictor corrections (training set)...")
    combined_train = np.concatenate([X_train_norm, base_pred_train], axis=1).astype(np.float32)
    error_correction_train = []
    for i in range(0, len(combined_train), inference_batch_size):
        batch = combined_train[i : i + inference_batch_size]
        correction = error_predictor_session.run(None, {"combined_input": batch})[0]
        error_correction_train.append(correction)
    error_correction_train = np.concatenate(error_correction_train, axis=0)

    LOGGER.info(" Stage 2: First error predictor corrections (validation set)...")
    combined_val = np.concatenate([X_val_norm, base_pred_val], axis=1).astype(np.float32)
    error_correction_val = []
    for i in range(0, len(combined_val), inference_batch_size):
        batch = combined_val[i : i + inference_batch_size]
        correction = error_predictor_session.run(None, {"combined_input": batch})[0]
        error_correction_val.append(correction)
    error_correction_val = np.concatenate(error_correction_val, axis=0)

    # Stage 2 predictions (base + first error correction)
    stage2_pred_train = base_pred_train + error_correction_train
    stage2_pred_val = base_pred_val + error_correction_val

    # Compute remaining errors for stage 3 — these residuals are the
    # regression targets for the model trained below.
    error_train = y_train_norm - stage2_pred_train
    error_val = y_val_norm - stage2_pred_val

    # Statistics
    LOGGER.info("")
    LOGGER.info("Stage 2 prediction error statistics (normalized space):")
    LOGGER.info(" Mean absolute error: %.6f", np.mean(np.abs(error_train)))
    LOGGER.info(" Std of error: %.6f", np.std(error_train))
    LOGGER.info(" Max absolute error: %.6f", np.max(np.abs(error_train)))

    # Compare with stage 1 errors
    stage1_error_train = y_train_norm - base_pred_train
    LOGGER.info("")
    LOGGER.info("Stage 1 (base only) error statistics for comparison:")
    LOGGER.info(" Mean absolute error: %.6f", np.mean(np.abs(stage1_error_train)))
    LOGGER.info(" Std of error: %.6f", np.std(stage1_error_train))

    # How much of the Stage-1 error Stage 2 already removed, in percent.
    error_reduction = (
        (np.mean(np.abs(stage1_error_train)) - np.mean(np.abs(error_train)))
        / np.mean(np.abs(stage1_error_train))
        * 100
    )
    LOGGER.info("")
    LOGGER.info("Stage 2 error reduction vs Stage 1: %.1f%%", error_reduction)

    # Create combined input for stage 3: [xyY_norm, stage2_pred_norm]
    X_train_combined = np.concatenate([X_train_norm, stage2_pred_train], axis=1)
    X_val_combined = np.concatenate([X_val_norm, stage2_pred_val], axis=1)

    # Convert to PyTorch tensors
    X_train_t = torch.FloatTensor(X_train_combined)
    error_train_t = torch.FloatTensor(error_train)
    X_val_t = torch.FloatTensor(X_val_combined)
    error_val_t = torch.FloatTensor(error_val)

    train_dataset = TensorDataset(X_train_t, error_train_t)
    val_dataset = TensorDataset(X_val_t, error_val_t)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize second error predictor (same architecture as first)
    model = MultiHeadErrorPredictorToMunsell().to(device)
    LOGGER.info("")
    LOGGER.info("Stage 3: Second error predictor architecture:")
    LOGGER.info("%s", model)

    total_params = sum(p.numel() for p in model.parameters())
    LOGGER.info("Total parameters: %s", f"{total_params:,}")

    # Training setup
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.5, patience=10
    )
    criterion = precision_focused_loss

    run_name = setup_mlflow_experiment("from_xyY", "multi_head_3stage_error_predictor")

    LOGGER.info("")
    LOGGER.info("MLflow run: %s", run_name)

    # Training loop
    best_val_loss = float("inf")
    patience_counter = 0

    LOGGER.info("")
    LOGGER.info("Starting Stage 3 training...")

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(
            {
                "model": "multi_head_3stage_error_predictor",
                "num_epochs": epochs,
                "batch_size": batch_size,
                "learning_rate": lr,
                "weight_decay": 1e-5,
                "optimizer": "AdamW",
                "scheduler": "ReduceLROnPlateau",
                "criterion": "precision_focused_loss",
                "patience": patience,
                "total_params": total_params,
                "train_samples": len(X_train),
                "val_samples": len(X_val),
                "stage2_error_reduction_pct": error_reduction,
            }
        )

        for epoch in range(epochs):
            train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
            val_loss = validate(model, val_loader, criterion, device)

            scheduler.step(val_loss)

            log_training_epoch(
                epoch, train_loss, val_loss, optimizer.param_groups[0]["lr"]
            )

            # NOTE(review): epoch stats are also logged via log_training_epoch
            # above — this LOGGER.info may duplicate console output; confirm.
            LOGGER.info(
                "Epoch %03d/%d - Train Loss: %.6f, Val Loss: %.6f, LR: %.6f",
                epoch + 1,
                epochs,
                train_loss,
                val_loss,
                optimizer.param_groups[0]["lr"],
            )

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0

                # Checkpoint the best model seen so far; always taken on the
                # first epoch since best_val_loss starts at infinity.
                model_directory.mkdir(exist_ok=True)
                checkpoint_file = model_directory / "multi_head_3stage_error_predictor_best.pth"

                torch.save(
                    {
                        "model_state_dict": model.state_dict(),
                        "epoch": epoch,
                        "val_loss": val_loss,
                    },
                    checkpoint_file,
                )

                LOGGER.info(" → Saved best model (val_loss: %.6f)", val_loss)
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    LOGGER.info("")
                    LOGGER.info("Early stopping after %d epochs", epoch + 1)
                    break

        mlflow.log_metrics(
            {
                "best_val_loss": best_val_loss,
                "final_epoch": epoch + 1,
            }
        )

        # Export to ONNX
        LOGGER.info("")
        LOGGER.info("Exporting Stage 3 error predictor to ONNX...")
        model.eval()

        # Reload the best checkpoint before export so the exported graph
        # carries the best weights, not the last epoch's.
        checkpoint = torch.load(checkpoint_file, weights_only=False)
        model.load_state_dict(checkpoint["model_state_dict"])

        # 7 inputs = 3 normalized xyY values + 4 Stage-2 Munsell predictions.
        dummy_input = torch.randn(1, 7).to(device)

        onnx_file = model_directory / "multi_head_3stage_error_predictor.onnx"
        torch.onnx.export(
            model,
            dummy_input,
            onnx_file,
            export_params=True,
            opset_version=15,
            input_names=["combined_input"],
            output_names=["error_correction"],
            dynamic_axes={
                "combined_input": {0: "batch_size"},
                "error_correction": {0: "batch_size"},
            },
        )

        LOGGER.info("Stage 3 error predictor ONNX model saved to: %s", onnx_file)

        mlflow.log_artifact(str(checkpoint_file))
        mlflow.log_artifact(str(onnx_file))
        mlflow.pytorch.log_model(model, "model")

        LOGGER.info("=" * 80)
406
+
407
+
408
+ if __name__ == "__main__":
409
+ logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)
410
+
411
+ main()
learning_munsell/training/from_xyY/train_multi_head_circular.py ADDED
@@ -0,0 +1,479 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train Multi-Head model with circular hue loss for xyY to Munsell conversion.
3
+
4
+ This version uses circular loss for the hue component (which wraps from 0-10)
5
+ to avoid penalizing predictions near the boundary.
6
+
7
+ Key Difference from Standard Training:
8
+ - Uses munsell_component_loss() which applies circular MSE for hue
9
+ - and regular MSE for value/chroma/code components
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import copy
15
+ import logging
16
+
17
+ import click
18
+ import mlflow
19
+ import mlflow.pytorch
20
+ import numpy as np
21
+ import torch
22
+ from torch import nn, optim
23
+ from torch.utils.data import DataLoader, TensorDataset
24
+
25
+ from learning_munsell import PROJECT_ROOT
26
+ from learning_munsell.utilities.common import setup_mlflow_experiment
27
+ from learning_munsell.utilities.data import (
28
+ MUNSELL_NORMALIZATION_PARAMS,
29
+ normalize_munsell,
30
+ )
31
+ from learning_munsell.training.from_xyY.hyperparameter_search_multi_head import (
32
+ MultiHeadParametric,
33
+ )
34
+
35
+ LOGGER = logging.getLogger(__name__)
36
+
37
+
38
def circular_mse_loss(
    pred_hue: torch.Tensor, target_hue: torch.Tensor, hue_range: float = 1.0
) -> torch.Tensor:
    """
    Mean squared error on a circle for the hue component.

    Hue wraps around at ``hue_range``, so the effective distance between
    two hues is the shorter of the direct gap and the wrap-around gap.

    Parameters
    ----------
    pred_hue : Tensor
        Predicted hue values (normalized 0-1).
    target_hue : Tensor
        Target hue values (normalized 0-1).
    hue_range : float
        Range of hue values (1.0 for normalized).

    Returns
    -------
    Tensor
        Circular MSE loss (scalar).
    """
    direct_gap = (pred_hue - target_hue).abs()
    # Going the other way around the circle costs hue_range - direct_gap;
    # the true angular separation is whichever is shorter.
    shortest_gap = torch.minimum(direct_gap, hue_range - direct_gap)
    return (shortest_gap * shortest_gap).mean()


def munsell_component_loss(
    pred: torch.Tensor, target: torch.Tensor, hue_range: float = 1.0
) -> torch.Tensor:
    """
    Component-wise loss for Munsell predictions.

    Hue (column 0) is scored with circular MSE, while value, chroma and
    code (columns 1-3) use plain MSE; the two terms are summed.

    Parameters
    ----------
    pred : Tensor
        Predictions [hue, value, chroma, code] (shape: [batch, 4]).
    target : Tensor
        Ground truth [hue, value, chroma, code] (shape: [batch, 4]).
    hue_range : float
        Range of normalized hue values (default 1.0).

    Returns
    -------
    Tensor
        Combined loss (scalar).
    """
    wraparound_term = circular_mse_loss(pred[:, 0], target[:, 0], hue_range)
    linear_term = nn.functional.mse_loss(pred[:, 1:], target[:, 1:])
    return wraparound_term + linear_term
89
+
90
+
91
@click.command()
@click.option("--epochs", default=300, help="Number of training epochs")
@click.option("--batch-size", default=512, help="Batch size for training")
@click.option("--lr", default=0.000837, help="Learning rate")
@click.option("--patience", default=30, help="Early stopping patience")
def main(
    epochs: int,
    batch_size: int,
    lr: float,
    patience: int,
    encoder_width: float = 0.75,
    head_width: float = 1.5,
    chroma_head_width: float = 1.5,
    dropout: float = 0.0,
    weight_decay: float = 0.000013,
) -> tuple[MultiHeadParametric, float]:
    """
    Train Multi-Head model with circular hue loss.

    This script uses circular loss for the hue component (which wraps from
    0-10) to avoid penalizing predictions near the boundary.

    Parameters
    ----------
    epochs : int, optional
        Maximum number of training epochs.
    batch_size : int, optional
        Training batch size.
    lr : float, optional
        Learning rate for AdamW optimizer.
    patience : int, optional
        Early stopping patience in epochs.
    encoder_width : float, optional
        Width multiplier for the shared encoder.
        NOTE(review): not exposed as a click option, so it is always the
        default when invoked from the CLI — only a programmatic caller can
        change it (same for the parameters below).
    head_width : float, optional
        Width multiplier for hue, value, and code heads.
    chroma_head_width : float, optional
        Width multiplier for chroma head (typically larger).
    dropout : float, optional
        Dropout rate for regularization.
    weight_decay : float, optional
        Weight decay for AdamW optimizer.

    Returns
    -------
    model : MultiHeadParametric
        Trained model with best validation loss weights.
    best_val_loss : float
        Best validation loss achieved during training.

    Notes
    -----
    The training pipeline:
    1. Loads training data from cache
    2. Normalizes outputs to [0, 1] range
    3. Trains with circular MSE for hue and regular MSE for other components
    4. Uses CosineAnnealingLR scheduler
    5. Early stopping based on validation loss
    6. Exports model to ONNX format
    7. Logs metrics and artifacts to MLflow

    The circular loss experiment showed that while mathematically correct,
    the circular distance creates gradient discontinuities that harm
    optimization. This model is included for comparison purposes.
    """

    LOGGER.info("=" * 80)
    LOGGER.info("Training Multi-Head (Circular Hue Loss) for xyY to Munsell conversion")
    LOGGER.info("=" * 80)
    LOGGER.info("")
    LOGGER.info("Using Circular Loss for Hue Component")
    LOGGER.info("=" * 80)
    LOGGER.info("")
    LOGGER.info("Hyperparameters:")
    LOGGER.info(" lr: %.6f", lr)
    LOGGER.info(" batch_size: %d", batch_size)
    LOGGER.info(" encoder_width: %.2f", encoder_width)
    LOGGER.info(" head_width: %.2f", head_width)
    LOGGER.info(" chroma_head_width: %.2f", chroma_head_width)
    LOGGER.info(" dropout: %.2f", dropout)
    LOGGER.info(" weight_decay: %.6f", weight_decay)
    LOGGER.info("")

    # NOTE(review): only MPS or CPU are considered here — CUDA is never
    # selected, unlike sibling training scripts; confirm this is intended.
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    LOGGER.info("Using device: %s", device)

    # Load data from cache
    data_dir = PROJECT_ROOT / "data"
    cache_file = data_dir / "training_data.npz"
    data = np.load(cache_file)

    X_train = data["X_train"]
    y_train = data["y_train"]
    X_val = data["X_val"]
    y_val = data["y_val"]

    LOGGER.info("Training samples: %d", len(X_train))
    LOGGER.info("Validation samples: %d", len(X_val))

    # Normalize outputs (xyY inputs already in [0, 1] range)
    # Use shared normalization parameters covering the full Munsell space for generalization
    output_params = MUNSELL_NORMALIZATION_PARAMS
    y_train_norm = normalize_munsell(y_train, output_params)
    y_val_norm = normalize_munsell(y_val, output_params)

    # Convert to tensors
    X_train_t = torch.from_numpy(X_train).float()
    y_train_t = torch.from_numpy(y_train_norm).float()
    X_val_t = torch.from_numpy(X_val).float()
    y_val_t = torch.from_numpy(y_val_norm).float()

    train_loader = DataLoader(
        TensorDataset(X_train_t, y_train_t), batch_size=batch_size, shuffle=True
    )
    val_loader = DataLoader(
        TensorDataset(X_val_t, y_val_t), batch_size=batch_size, shuffle=False
    )

    # Create model
    model = MultiHeadParametric(
        encoder_width=encoder_width,
        head_width=head_width,
        chroma_head_width=chroma_head_width,
        dropout=dropout,
    ).to(device)

    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    total_params = sum(p.numel() for p in model.parameters())
    LOGGER.info("")
    LOGGER.info("Model parameters: %s", f"{total_params:,}")

    # Per-submodule parameter counts, for the architecture breakdown log.
    encoder_params = sum(p.numel() for p in model.encoder.parameters())
    hue_params = sum(p.numel() for p in model.hue_head.parameters())
    value_params = sum(p.numel() for p in model.value_head.parameters())
    chroma_params = sum(p.numel() for p in model.chroma_head.parameters())
    code_params = sum(p.numel() for p in model.code_head.parameters())

    LOGGER.info(" - Shared encoder (%.2fx): %s", encoder_width, f"{encoder_params:,}")
    LOGGER.info(" - Hue head (%.2fx): %s", head_width, f"{hue_params:,}")
    LOGGER.info(" - Value head (%.2fx): %s", head_width, f"{value_params:,}")
    LOGGER.info(" - Chroma head (%.2fx): %s", chroma_head_width, f"{chroma_params:,}")
    LOGGER.info(" - Code head (%.2fx): %s", head_width, f"{code_params:,}")

    # MLflow setup
    run_name = setup_mlflow_experiment("from_xyY", "multi_head_circular")
    LOGGER.info("")
    LOGGER.info("MLflow run: %s", run_name)

    best_val_loss = float("inf")
    best_state = None
    patience_counter = 0

    LOGGER.info("")
    LOGGER.info("Starting training with circular hue loss...")

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(
            {
                "model": "multi_head_circular",
                "encoder_width": encoder_width,
                "head_width": head_width,
                "chroma_head_width": chroma_head_width,
                "dropout": dropout,
                "learning_rate": lr,
                "batch_size": batch_size,
                "weight_decay": weight_decay,
                "epochs": epochs,
                "patience": patience,
                "total_params": total_params,
                "loss_type": "circular_hue",
            }
        )

        for epoch in range(epochs):
            # Training
            model.train()
            train_loss = 0.0
            for X_batch, y_batch in train_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)

                optimizer.zero_grad()
                pred = model(X_batch)

                # Use circular loss for hue component
                loss = munsell_component_loss(pred, y_batch, hue_range=1.0)

                loss.backward()
                optimizer.step()
                # Weight by batch size so the epoch mean is per-sample.
                train_loss += loss.item() * len(X_batch)

            train_loss /= len(X_train_t)
            scheduler.step()

            # Validation
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for X_batch, y_batch in val_loader:
                    X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                    pred = model(X_batch)
                    val_loss += munsell_component_loss(
                        pred, y_batch, hue_range=1.0
                    ).item() * len(X_batch)
            val_loss /= len(X_val_t)

            # Per-component MAE (denormalized for interpretability)
            with torch.no_grad():
                pred_val = model(X_val_t.to(device)).cpu()
                # Denormalize predictions and ground truth
                # NOTE(review): .numpy() shares memory with pred_val — each
                # column assignment below reads its own column before writing
                # it, so this is correct, but fragile if reordered.
                pred_denorm = pred_val.numpy()
                hue_min, hue_max = output_params["hue_range"]
                value_min, value_max = output_params["value_range"]
                chroma_min, chroma_max = output_params["chroma_range"]
                code_min, code_max = output_params["code_range"]

                pred_denorm[:, 0] = pred_val[:, 0].numpy() * (hue_max - hue_min) + hue_min  # hue
                pred_denorm[:, 1] = pred_val[:, 1].numpy() * (value_max - value_min) + value_min  # value
                pred_denorm[:, 2] = pred_val[:, 2].numpy() * (chroma_max - chroma_min) + chroma_min  # chroma
                pred_denorm[:, 3] = pred_val[:, 3].numpy() * (code_max - code_min) + code_min  # code

                y_denorm = y_val_norm.copy()
                y_denorm[:, 0] = y_val_norm[:, 0] * (hue_max - hue_min) + hue_min
                y_denorm[:, 1] = y_val_norm[:, 1] * (value_max - value_min) + value_min
                y_denorm[:, 2] = y_val_norm[:, 2] * (chroma_max - chroma_min) + chroma_min
                y_denorm[:, 3] = y_val_norm[:, 3] * (code_max - code_min) + code_min

                mae = np.mean(np.abs(pred_denorm - y_denorm), axis=0)

            mlflow.log_metrics(
                {
                    "train_loss": train_loss,
                    "val_loss": val_loss,
                    "mae_hue": mae[0],
                    "mae_value": mae[1],
                    "mae_chroma": mae[2],
                    "mae_code": mae[3],
                },
                step=epoch,
            )

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                # Deep-copy so later training steps don't mutate the snapshot.
                best_state = copy.deepcopy(model.state_dict())
                patience_counter = 0
                LOGGER.info(
                    "Epoch %03d/%d - Train: %.6f, Val: %.6f (best) - "
                    "MAE: hue=%.4f, value=%.4f, chroma=%.4f, code=%.4f",
                    epoch + 1,
                    epochs,
                    train_loss,
                    val_loss,
                    mae[0],
                    mae[1],
                    mae[2],
                    mae[3],
                )
            else:
                patience_counter += 1
                # Only log non-improving epochs every 50 to reduce noise.
                if (epoch + 1) % 50 == 0:
                    LOGGER.info(
                        "Epoch %03d/%d - Train: %.6f, Val: %.6f",
                        epoch + 1,
                        epochs,
                        train_loss,
                        val_loss,
                    )

            if patience_counter >= patience:
                LOGGER.info("Early stopping at epoch %d", epoch + 1)
                break

        # Load best model
        model.load_state_dict(best_state)

        # Final evaluation
        model.eval()
        with torch.no_grad():
            pred_val = model(X_val_t.to(device)).cpu()
            pred_denorm = pred_val.numpy()
            hue_min, hue_max = output_params["hue_range"]
            value_min, value_max = output_params["value_range"]
            chroma_min, chroma_max = output_params["chroma_range"]
            code_min, code_max = output_params["code_range"]

            pred_denorm[:, 0] = pred_val[:, 0].numpy() * (hue_max - hue_min) + hue_min
            pred_denorm[:, 1] = pred_val[:, 1].numpy() * (value_max - value_min) + value_min
            pred_denorm[:, 2] = pred_val[:, 2].numpy() * (chroma_max - chroma_min) + chroma_min
            pred_denorm[:, 3] = pred_val[:, 3].numpy() * (code_max - code_min) + code_min

        y_denorm = y_val_norm.copy()
        y_denorm[:, 0] = y_val_norm[:, 0] * (hue_max - hue_min) + hue_min
        y_denorm[:, 1] = y_val_norm[:, 1] * (value_max - value_min) + value_min
        y_denorm[:, 2] = y_val_norm[:, 2] * (chroma_max - chroma_min) + chroma_min
        y_denorm[:, 3] = y_val_norm[:, 3] * (code_max - code_min) + code_min

        mae = np.mean(np.abs(pred_denorm - y_denorm), axis=0)

        # Log final metrics
        mlflow.log_metrics(
            {
                "best_val_loss": best_val_loss,
                "final_mae_hue": mae[0],
                "final_mae_value": mae[1],
                "final_mae_chroma": mae[2],
                "final_mae_code": mae[3],
                "final_epoch": epoch + 1,
            }
        )

        LOGGER.info("")
        LOGGER.info("Final Results:")
        LOGGER.info(" Best Val Loss: %.6f", best_val_loss)
        LOGGER.info(" MAE hue: %.6f", mae[0])
        LOGGER.info(" MAE value: %.6f", mae[1])
        LOGGER.info(" MAE chroma: %.6f", mae[2])
        LOGGER.info(" MAE code: %.6f", mae[3])

        # Save model
        models_dir = PROJECT_ROOT / "models" / "from_xyY"
        models_dir.mkdir(exist_ok=True)

        checkpoint_path = models_dir / "multi_head_circular.pth"
        torch.save(
            {
                "model_state_dict": model.state_dict(),
                "output_params": output_params,
                "val_loss": best_val_loss,
                "mae": {
                    "hue": float(mae[0]),
                    "value": float(mae[1]),
                    "chroma": float(mae[2]),
                    "code": float(mae[3]),
                },
                "hyperparameters": {
                    "encoder_width": encoder_width,
                    "head_width": head_width,
                    "chroma_head_width": chroma_head_width,
                    "dropout": dropout,
                    "lr": lr,
                    "batch_size": batch_size,
                    "weight_decay": weight_decay,
                },
                "loss_type": "circular_hue",
            },
            checkpoint_path,
        )
        LOGGER.info("")
        LOGGER.info("Saved checkpoint: %s", checkpoint_path)

        # Export to ONNX (on CPU so the exported graph is device-agnostic)
        model.cpu().eval()
        dummy_input = torch.randn(1, 3)
        onnx_path = models_dir / "multi_head_circular.onnx"

        torch.onnx.export(
            model,
            dummy_input,
            onnx_path,
            input_names=["xyY"],  # Match other models for comparison compatibility
            output_names=["munsell_spec"],
            dynamic_axes={"xyY": {0: "batch"}, "munsell_spec": {0: "batch"}},
            opset_version=17,
        )
        LOGGER.info("Saved ONNX: %s", onnx_path)

        # Save normalization parameters
        params_path = models_dir / "multi_head_circular_normalization_params.npz"
        np.savez(
            params_path,
            output_params=output_params,
        )
        LOGGER.info("Saved normalization parameters: %s", params_path)

        # Log artifacts to MLflow
        mlflow.log_artifact(str(checkpoint_path))
        mlflow.log_artifact(str(onnx_path))
        mlflow.log_artifact(str(params_path))
        mlflow.pytorch.log_model(model, "model")
        LOGGER.info("Artifacts logged to MLflow")

    LOGGER.info("=" * 80)

    # Returned for programmatic callers; click discards this when run as CLI.
    return model, best_val_loss
474
+
475
+
476
+ if __name__ == "__main__":
477
+ logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)
478
+
479
+ main()
learning_munsell/training/from_xyY/train_multi_head_cross_attention_error_predictor.py ADDED
@@ -0,0 +1,640 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train Multi-Head + Cross-Attention Error Predictor for xyY to Munsell conversion.
3
+
4
+ This version uses cross-attention between component branches to learn
5
+ correlations between errors in different Munsell components.
6
+
7
+ Key Features:
8
+ - Shared context encoder
9
+ - Multi-head cross-attention between components
10
+ - Component-specific prediction heads
11
+ - Residual connections
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import copy
17
+ import logging
18
+
19
+ import mlflow
20
+ import mlflow.pytorch
21
+ import numpy as np
22
+ import onnxruntime as ort
23
+ import torch
24
+ from torch import nn, optim
25
+ from torch.utils.data import DataLoader, TensorDataset
26
+
27
+ from learning_munsell import PROJECT_ROOT
28
+ from learning_munsell.utilities.common import log_training_epoch, setup_mlflow_experiment
29
+ from learning_munsell.utilities.data import normalize_xyY, normalize_munsell
30
+
31
+ LOGGER = logging.getLogger(__name__)
32
+
33
+ # Note: This script has a custom CrossAttentionErrorPredictor architecture
34
+ # so we don't import ComponentErrorPredictor/MultiHeadErrorPredictor from shared modules.
35
+
36
+
37
class CustomMultiheadAttention(nn.Module):
    """
    ONNX-friendly multi-head self-attention built from primitive ops.

    ``nn.MultiheadAttention`` produces reshape patterns that break ONNX
    export with dynamic batch sizes, so this module assembles the same
    computation from plain linear projections, matmuls, and a softmax.

    Parameters
    ----------
    embed_dim : int
        Total model dimension; must be divisible by ``num_heads``.
    num_heads : int
        Number of parallel attention heads.
    dropout : float, optional
        Dropout probability applied to the attention weights.

    Attributes
    ----------
    embed_dim : int
        Total embedding dimension.
    num_heads : int
        Number of attention heads.
    head_dim : int
        Per-head dimension (``embed_dim // num_heads``).
    scale : float
        Attention-score scaling factor (``head_dim ** -0.5``).
    q_proj, k_proj, v_proj, out_proj : nn.Linear
        Query / key / value / output projections.
    dropout : nn.Dropout
        Dropout applied to the attention weights.
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
    ) -> None:
        """Create the Q/K/V/output projections and the weight dropout."""
        super().__init__()

        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        # Standard 1/sqrt(d_k) scaling for the dot-product scores.
        self.scale = self.head_dim**-0.5

        # Separate projections; creation order is kept stable so seeded
        # initialization and checkpoint state-dict keys stay consistent.
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply self-attention to ``x``.

        Parameters
        ----------
        x : Tensor
            Input of shape [batch, seq_len, embed_dim].

        Returns
        -------
        Tensor
            Output of shape [batch, seq_len, embed_dim].
        """
        seq_len = x.shape[1]

        def split_heads(t: torch.Tensor) -> torch.Tensor:
            # [batch, seq, embed] -> [batch, heads, seq, head_dim].
            # ``-1`` in place of the batch size keeps the exported ONNX
            # graph valid for dynamic batch sizes.
            return t.reshape(-1, seq_len, self.num_heads, self.head_dim).permute(
                0, 2, 1, 3
            )

        queries = split_heads(self.q_proj(x))
        keys = split_heads(self.k_proj(x))
        values = split_heads(self.v_proj(x))

        # Scaled dot-product attention: [batch, heads, seq, seq] weights.
        scores = (queries @ keys.transpose(-2, -1)) * self.scale
        weights = self.dropout(scores.softmax(dim=-1))

        # Mix the values, merge the heads back, and project out.
        mixed = (weights @ values).permute(0, 2, 1, 3).contiguous()
        mixed = mixed.reshape(-1, seq_len, self.embed_dim)

        return self.out_proj(mixed)
150
+
151
+
152
class CrossAttentionErrorPredictor(nn.Module):
    """
    Residual-error predictor whose component branches exchange information.

    A shared encoder maps the 7-dimensional input (normalized xyY plus the
    base model's normalized Munsell prediction) to a context vector. Four
    branch encoders specialize that context, multi-head attention lets the
    branches attend to one another (capturing correlated errors between
    hue, value, chroma, and code), and per-branch decoders emit one scalar
    correction each.

    Parameters
    ----------
    input_dim : int, optional
        Input dimension (7 = xyY_norm + base_pred_norm).
    context_dim : int, optional
        Dimension of the shared context features.
    component_dim : int, optional
        Dimension of the component-specific features.
    n_components : int, optional
        Number of Munsell components (4).
    n_attention_heads : int, optional
        Number of attention heads.
    dropout : float, optional
        Dropout probability.

    Attributes
    ----------
    context_encoder : nn.Sequential
        Shared trunk: input_dim → 256 → context_dim.
    component_encoders : nn.ModuleList
        Per-component encoders: context_dim → component_dim (x4).
    cross_attention : CustomMultiheadAttention
        Attention over the stacked component features.
    attention_norm : nn.LayerNorm
        Post-attention layer normalization (applied after the residual add).
    component_decoders : nn.ModuleList
        Per-component decoders: component_dim → 128 → 1 (x4).
    """

    def __init__(
        self,
        input_dim: int = 7,
        context_dim: int = 512,
        component_dim: int = 256,
        n_components: int = 4,
        n_attention_heads: int = 4,
        dropout: float = 0.1,
    ) -> None:
        """Build the encoder, attention, and decoder sub-modules."""
        super().__init__()

        self.n_components = n_components
        self.component_dim = component_dim

        # Shared trunk: input -> 256 -> context_dim.
        self.context_encoder = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.GELU(),
            nn.LayerNorm(256),
            nn.Dropout(dropout),
            nn.Linear(256, context_dim),
            nn.GELU(),
            nn.LayerNorm(context_dim),
        )

        def _branch_encoder() -> nn.Sequential:
            # One specialization branch per Munsell component.
            return nn.Sequential(
                nn.Linear(context_dim, component_dim),
                nn.GELU(),
                nn.LayerNorm(component_dim),
            )

        self.component_encoders = nn.ModuleList(
            _branch_encoder() for _ in range(n_components)
        )

        # Attention over the length-4 "sequence" of component features
        # (custom implementation for clean ONNX export).
        self.cross_attention = CustomMultiheadAttention(
            embed_dim=component_dim,
            num_heads=n_attention_heads,
            dropout=dropout,
        )

        self.attention_norm = nn.LayerNorm(component_dim)

        def _branch_decoder() -> nn.Sequential:
            # One scalar correction head per component.
            return nn.Sequential(
                nn.Linear(component_dim, 128),
                nn.GELU(),
                nn.LayerNorm(128),
                nn.Dropout(dropout),
                nn.Linear(128, 1),
            )

        self.component_decoders = nn.ModuleList(
            _branch_decoder() for _ in range(n_components)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Predict per-component error corrections.

        Parameters
        ----------
        x : Tensor
            Input [xyY_norm (3) + base_pred_norm (4)] = 7 features.

        Returns
        -------
        Tensor
            Predicted errors [hue_err, value_err, chroma_err, code_err].
        """
        shared = self.context_encoder(x)  # [batch, context_dim]

        # Specialize the shared context per component, then stack into a
        # [batch, 4, component_dim] "sequence" for attention.
        branch_features = [encoder(shared) for encoder in self.component_encoders]
        stacked = torch.stack(branch_features, dim=1)

        # Residual cross-component attention followed by layer norm.
        stacked = self.attention_norm(stacked + self.cross_attention(stacked))

        # ``unbind`` keeps the batch dimension intact, which matters for the
        # ONNX export with a dynamic batch axis.
        hue_feat, value_feat, chroma_feat, code_feat = torch.unbind(stacked, dim=1)

        # Decode each component explicitly (unrolled for ONNX) and join
        # into a [batch, 4] prediction tensor.
        return torch.cat(
            [
                self.component_decoders[0](hue_feat),
                self.component_decoders[1](value_feat),
                self.component_decoders[2](chroma_feat),
                self.component_decoders[3](code_feat),
            ],
            dim=1,
        )
307
+
308
+
309
def train_cross_attention_error_predictor(
    epochs: int = 300,
    batch_size: int = 1024,
    lr: float = 0.0005,
    dropout: float = 0.1,
    context_dim: int = 512,
    component_dim: int = 256,
    n_attention_heads: int = 4,
) -> tuple[CrossAttentionErrorPredictor, float]:
    """
    Train cross-attention error predictor.

    This model uses cross-attention between component branches to learn
    correlations between errors in different Munsell components.

    Parameters
    ----------
    epochs : int, optional
        Maximum number of training epochs.
    batch_size : int, optional
        Training batch size.
    lr : float, optional
        Learning rate for AdamW optimizer.
    dropout : float, optional
        Dropout rate for regularization.
    context_dim : int, optional
        Dimension of shared context features.
    component_dim : int, optional
        Dimension of component-specific features.
    n_attention_heads : int, optional
        Number of attention heads for cross-attention.

    Returns
    -------
    model : CrossAttentionErrorPredictor
        Trained model with best validation loss weights.
    best_val_loss : float
        Best validation loss achieved during training.

    Notes
    -----
    The training pipeline:
    1. Loads pre-trained Multi-Head base model
    2. Generates base model predictions for training data
    3. Computes residual errors between predictions and targets
    4. Trains cross-attention error predictor on these residuals
    5. Uses CosineAnnealingLR scheduler
    6. Early stopping based on validation loss
    7. Exports model to ONNX format
    8. Logs metrics and artifacts to MLflow
    """

    LOGGER.info("=" * 80)
    LOGGER.info("Training Multi-Head + Cross-Attention Error Predictor")
    LOGGER.info("=" * 80)
    LOGGER.info("")
    LOGGER.info("Architecture:")
    LOGGER.info(" - Shared context encoder: 7 → 256 → %d", context_dim)
    LOGGER.info(" - Component encoders: %d → %d (x4)", context_dim, component_dim)
    LOGGER.info(" - Cross-attention: %d heads", n_attention_heads)
    LOGGER.info(" - Component decoders: %d → 128 → 1 (x4)", component_dim)
    LOGGER.info("")
    LOGGER.info("Hyperparameters:")
    LOGGER.info(" lr: %.6f", lr)
    LOGGER.info(" batch_size: %d", batch_size)
    LOGGER.info(" dropout: %.2f", dropout)
    LOGGER.info("")

    # NOTE(review): only MPS (Apple GPU) or CPU are considered here — no CUDA
    # branch; presumably developed on macOS. Confirm if CUDA support is wanted.
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    LOGGER.info("Using device: %s", device)

    # Paths for the frozen base model, its normalization params, and data.
    model_directory = PROJECT_ROOT / "models" / "from_xyY"
    data_dir = PROJECT_ROOT / "data"
    base_model_path = model_directory / "multi_head.onnx"
    params_path = model_directory / "multi_head_normalization_params.npz"
    cache_file = data_dir / "training_data.npz"

    # Load the frozen base model (inference only, via ONNX Runtime).
    LOGGER.info("")
    LOGGER.info("Loading Multi-Head base model from %s...", base_model_path)
    base_session = ort.InferenceSession(str(base_model_path))
    # allow_pickle is required because the params are stored as object dicts.
    params = np.load(params_path, allow_pickle=True)
    input_params = params["input_params"].item()
    output_params = params["output_params"].item()

    # Load training data
    LOGGER.info("Loading training data from %s...", cache_file)
    data = np.load(cache_file)
    X_train = data["X_train"]
    y_train = data["y_train"]
    X_val = data["X_val"]
    y_val = data["y_val"]

    LOGGER.info("Train samples: %d", len(X_train))
    LOGGER.info("Validation samples: %d", len(X_val))

    # Generate base model predictions in normalized space.
    # NOTE(review): assumes normalize_xyY returns float32 — ONNX Runtime
    # rejects float64 inputs for a float32 graph; confirm in utilities.data.
    LOGGER.info("")
    LOGGER.info("Generating Multi-Head base model predictions...")
    X_train_norm = normalize_xyY(X_train, input_params)
    y_train_norm = normalize_munsell(y_train, output_params)
    base_pred_train_norm = base_session.run(None, {"xyY": X_train_norm})[0]

    X_val_norm = normalize_xyY(X_val, input_params)
    y_val_norm = normalize_munsell(y_val, output_params)
    base_pred_val_norm = base_session.run(None, {"xyY": X_val_norm})[0]

    # Residual errors (targets for the error predictor), normalized space.
    error_train = y_train_norm - base_pred_train_norm
    error_val = y_val_norm - base_pred_val_norm

    LOGGER.info("")
    LOGGER.info("Base model error statistics (normalized space):")
    LOGGER.info(" Mean absolute error: %.6f", np.mean(np.abs(error_train)))
    LOGGER.info(" Std of error: %.6f", np.std(error_train))
    LOGGER.info(" Max absolute error: %.6f", np.max(np.abs(error_train)))

    # Create combined input: [xyY_norm, base_prediction_norm] → 7 features.
    X_train_combined = np.concatenate([X_train_norm, base_pred_train_norm], axis=1)
    X_val_combined = np.concatenate([X_val_norm, base_pred_val_norm], axis=1)

    # Convert to PyTorch tensors
    X_train_t = torch.FloatTensor(X_train_combined)
    error_train_t = torch.FloatTensor(error_train)
    X_val_t = torch.FloatTensor(X_val_combined)
    error_val_t = torch.FloatTensor(error_val)

    # Create data loaders
    train_dataset = TensorDataset(X_train_t, error_train_t)
    val_dataset = TensorDataset(X_val_t, error_val_t)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model
    model = CrossAttentionErrorPredictor(
        input_dim=7,
        context_dim=context_dim,
        component_dim=component_dim,
        n_attention_heads=n_attention_heads,
        dropout=dropout,
    ).to(device)

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    LOGGER.info("")
    LOGGER.info("Total parameters: %s", f"{total_params:,}")

    context_params = sum(p.numel() for p in model.context_encoder.parameters())
    attention_params = sum(p.numel() for p in model.cross_attention.parameters())
    LOGGER.info(" - Context encoder: %s", f"{context_params:,}")
    LOGGER.info(" - Cross-attention: %s", f"{attention_params:,}")

    # Training setup: MSE on normalized residuals, AdamW, cosine LR decay.
    criterion = nn.MSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    # MLflow setup
    run_name = setup_mlflow_experiment("from_xyY", "cross_attention_error_predictor")
    LOGGER.info("")
    LOGGER.info("MLflow run: %s", run_name)

    # Training loop state.
    best_val_loss = float("inf")
    best_state = None
    patience = 30
    patience_counter = 0

    LOGGER.info("")
    LOGGER.info("Starting training...")

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(
            {
                "model": "cross_attention_error_predictor",
                "context_dim": context_dim,
                "component_dim": component_dim,
                "n_attention_heads": n_attention_heads,
                "dropout": dropout,
                "learning_rate": lr,
                "batch_size": batch_size,
                "epochs": epochs,
                "patience": patience,
                "total_params": total_params,
            }
        )

        for epoch in range(epochs):
            # Training
            model.train()
            train_loss = 0.0
            for X_batch, y_batch in train_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)

                optimizer.zero_grad()
                pred = model(X_batch)
                loss = criterion(pred, y_batch)
                loss.backward()
                optimizer.step()
                # Accumulate the per-sample loss sum; divided by dataset
                # size below to get the epoch mean.
                train_loss += loss.item() * len(X_batch)

            train_loss /= len(X_train_t)
            scheduler.step()

            # Validation
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for X_batch, y_batch in val_loader:
                    X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                    pred = model(X_batch)
                    val_loss += criterion(pred, y_batch).item() * len(X_batch)
            val_loss /= len(X_val_t)

            log_training_epoch(
                epoch, train_loss, val_loss, optimizer.param_groups[0]["lr"]
            )

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                # Deep-copy so later epochs don't mutate the snapshot.
                best_state = copy.deepcopy(model.state_dict())
                patience_counter = 0
                LOGGER.info(
                    "Epoch %03d/%d - Train: %.6f, Val: %.6f (best) - LR: %.6f",
                    epoch + 1,
                    epochs,
                    train_loss,
                    val_loss,
                    optimizer.param_groups[0]["lr"],
                )
            else:
                patience_counter += 1
                # Only log non-improving epochs every 50 epochs to keep the
                # output readable.
                if (epoch + 1) % 50 == 0:
                    LOGGER.info(
                        "Epoch %03d/%d - Train: %.6f, Val: %.6f",
                        epoch + 1,
                        epochs,
                        train_loss,
                        val_loss,
                    )

            if patience_counter >= patience:
                LOGGER.info("Early stopping at epoch %d", epoch + 1)
                break

        # Load best model. ``best_state`` is always set: the first epoch
        # necessarily improves on the initial ``inf`` best_val_loss.
        model.load_state_dict(best_state)

        mlflow.log_metrics(
            {
                "best_val_loss": best_val_loss,
                "final_epoch": epoch + 1,
            }
        )

        LOGGER.info("")
        LOGGER.info("Final Results:")
        LOGGER.info(" Best Val Loss: %.6f", best_val_loss)

        # Save model checkpoint.
        # NOTE(review): mkdir without parents=True — assumes PROJECT_ROOT/models
        # already exists (created by the base-model training); confirm.
        model_directory.mkdir(exist_ok=True)
        checkpoint_path = (
            model_directory / "multi_head_cross_attention_error_predictor.pth"
        )

        torch.save(
            {
                "model_state_dict": model.state_dict(),
                "val_loss": best_val_loss,
                "hyperparameters": {
                    "context_dim": context_dim,
                    "component_dim": component_dim,
                    "n_attention_heads": n_attention_heads,
                    "dropout": dropout,
                    "lr": lr,
                    "batch_size": batch_size,
                },
            },
            checkpoint_path,
        )
        LOGGER.info("")
        LOGGER.info("Saved checkpoint: %s", checkpoint_path)

        # Export to ONNX. Model is moved to CPU first so the exported graph
        # does not depend on the training device.
        LOGGER.info("")
        LOGGER.info("Exporting error predictor to ONNX...")
        model.eval()
        model.cpu()

        dummy_input = torch.randn(1, 7)
        onnx_path = model_directory / "multi_head_cross_attention_error_predictor.onnx"

        torch.onnx.export(
            model,
            dummy_input,
            onnx_path,
            export_params=True,
            opset_version=17,
            input_names=["combined_input"],
            output_names=["error_correction"],
            dynamic_axes={
                "combined_input": {0: "batch_size"},
                "error_correction": {0: "batch_size"},
            },
        )

        mlflow.log_artifact(str(checkpoint_path))
        mlflow.log_artifact(str(onnx_path))
        mlflow.pytorch.log_model(model, "model")

        LOGGER.info("ONNX model saved to: %s", onnx_path)
        LOGGER.info("Artifacts logged to MLflow")

        LOGGER.info("=" * 80)

    # NOTE: model is on CPU at this point (moved for ONNX export above).
    return model, best_val_loss
627
+
628
+
629
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)

    # Default experiment configuration for a standalone run.
    config = {
        "epochs": 300,
        "batch_size": 1024,
        "lr": 0.0005,
        "dropout": 0.1,
        "context_dim": 512,
        "component_dim": 256,
        "n_attention_heads": 4,
    }
    train_cross_attention_error_predictor(**config)
640
+ )
learning_munsell/training/from_xyY/train_multi_head_gamma.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train multi-head ML model for xyY to Munsell conversion with gamma-corrected Y.
3
+
4
+ Experiment: Apply gamma 2.33 to Y before normalization to better align
5
+ with perceptual lightness (Munsell Value scale is perceptually uniform).
6
+
7
+ The multi-head architecture has separate heads for each Munsell component,
8
+ so gamma correction on Y should primarily benefit Value prediction without
9
+ negatively impacting Chroma prediction (unlike the single MLP).
10
+ """
11
+
12
+ import logging
13
+ from typing import Any
14
+
15
+ import click
16
+ import mlflow
17
+ import mlflow.pytorch
18
+ import numpy as np
19
+ import torch
20
+ from numpy.typing import NDArray
21
+ from torch import nn, optim
22
+ from torch.utils.data import DataLoader, TensorDataset
23
+
24
+ from learning_munsell import PROJECT_ROOT
25
+ from learning_munsell.models.networks import MultiHeadMLPToMunsell
26
+ from learning_munsell.utilities.common import log_training_epoch, setup_mlflow_experiment
27
+ from learning_munsell.utilities.data import MUNSELL_NORMALIZATION_PARAMS, normalize_munsell
28
+ from learning_munsell.utilities.losses import weighted_mse_loss
29
+ from learning_munsell.utilities.training import train_epoch, validate
30
+
31
+ LOGGER = logging.getLogger(__name__)
32
+
33
# Gamma value for Y transformation
GAMMA = 2.33


def normalize_inputs(
    X: NDArray, gamma: float = GAMMA
) -> tuple[NDArray, dict[str, Any]]:
    """
    Normalize xyY inputs to [0, 1] range with gamma correction on Y.

    Parameters
    ----------
    X : ndarray
        xyY values of shape (n, 3) where columns are [x, y, Y].
    gamma : float
        Gamma value to apply to Y component.

    Returns
    -------
    ndarray
        Normalized values with gamma-corrected Y, dtype float32.
    dict
        Normalization parameters including gamma value.
    """
    # xyY chromaticity and luminance ranges (all [0, 1])
    x_range = (0.0, 1.0)
    y_range = (0.0, 1.0)
    Y_range = (0.0, 1.0)

    # Work in float32 so the documented output dtype holds for any input.
    # A plain ``X.copy()`` would keep the input dtype: integer arrays would
    # silently truncate the gamma-corrected Y, and float64 inputs would
    # contradict the docstring (downstream code wraps this in FloatTensor).
    X = np.asarray(X, dtype=np.float32)
    X_norm = X.copy()
    X_norm[:, 0] = (X[:, 0] - x_range[0]) / (x_range[1] - x_range[0])
    X_norm[:, 1] = (X[:, 1] - y_range[0]) / (y_range[1] - y_range[0])

    # Normalize Y first, then apply gamma
    Y_normalized = (X[:, 2] - Y_range[0]) / (Y_range[1] - Y_range[0])
    # Clip to avoid numerical issues with negative values
    Y_normalized = np.clip(Y_normalized, 0, 1)
    # Apply gamma: Y_gamma = Y^(1/gamma) - this spreads dark values, compresses light
    X_norm[:, 2] = np.power(Y_normalized, 1.0 / gamma)

    params = {
        "x_range": x_range,
        "y_range": y_range,
        "Y_range": Y_range,
        "gamma": gamma,
    }

    return X_norm, params
81
+
82
+
83
+
84
+
85
@click.command()
@click.option("--epochs", default=200, help="Number of training epochs")
@click.option("--batch-size", default=1024, help="Batch size for training")
@click.option("--lr", default=5e-4, help="Learning rate")
@click.option("--patience", default=20, help="Early stopping patience")
def main(epochs: int, batch_size: int, lr: float, patience: int) -> None:
    """
    Train the multi-head model with gamma-corrected Y input.

    Notes
    -----
    The training pipeline:
    1. Loads training and validation data from cache
    2. Normalizes inputs with gamma correction (gamma=2.33) on Y
    3. Normalizes Munsell outputs to [0, 1] range
    4. Trains multi-head MLP with weighted MSE loss
    5. Uses early stopping based on validation loss
    6. Exports best model to ONNX format
    7. Logs metrics and artifacts to MLflow

    The gamma correction on Y aligns with perceptual lightness. The Munsell
    Value scale is perceptually uniform, so gamma correction should primarily
    benefit Value prediction without negatively impacting Chroma prediction.
    """

    LOGGER.info("=" * 80)
    LOGGER.info("ML-Based xyY to Munsell Conversion: Multi-Head Gamma Experiment")
    LOGGER.info("Gamma = %.2f applied to Y component", GAMMA)
    LOGGER.info("=" * 80)

    # NOTE(review): CUDA/CPU only — no MPS branch, unlike the sibling
    # cross-attention trainer; confirm the intended target hardware.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    LOGGER.info("Using device: %s", device)

    # Load training data
    data_dir = PROJECT_ROOT / "data"
    cache_file = data_dir / "training_data.npz"

    # Graceful exit (not an exception) when the data-generation step was
    # skipped; this is a CLI entry point.
    if not cache_file.exists():
        LOGGER.error("Error: Training data not found at %s", cache_file)
        LOGGER.error("Please run 01_generate_training_data.py first")
        return

    LOGGER.info("Loading training data from %s...", cache_file)
    data = np.load(cache_file)

    X_train = data["X_train"]
    y_train = data["y_train"]
    X_val = data["X_val"]
    y_val = data["y_val"]

    LOGGER.info("Train samples: %d", len(X_train))
    LOGGER.info("Validation samples: %d", len(X_val))

    # Normalize data with gamma correction on Y (the experiment variable).
    X_train_norm, input_params = normalize_inputs(X_train, gamma=GAMMA)
    X_val_norm, _ = normalize_inputs(X_val, gamma=GAMMA)

    # Use shared normalization parameters for Munsell outputs
    output_params = MUNSELL_NORMALIZATION_PARAMS
    y_train_norm = normalize_munsell(y_train, output_params)
    y_val_norm = normalize_munsell(y_val, output_params)

    LOGGER.info("")
    LOGGER.info("Input normalization with gamma=%.2f:", GAMMA)
    LOGGER.info(" Y range after gamma: [%.4f, %.4f]", X_train_norm[:, 2].min(), X_train_norm[:, 2].max())

    # Convert to PyTorch tensors
    X_train_t = torch.FloatTensor(X_train_norm)
    y_train_t = torch.FloatTensor(y_train_norm)
    X_val_t = torch.FloatTensor(X_val_norm)
    y_val_t = torch.FloatTensor(y_val_norm)

    # Create data loaders
    train_dataset = TensorDataset(X_train_t, y_train_t)
    val_dataset = TensorDataset(X_val_t, y_val_t)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model (default architecture from networks module).
    model = MultiHeadMLPToMunsell().to(device)
    LOGGER.info("")
    LOGGER.info("Model architecture:")
    LOGGER.info("%s", model)

    total_params = sum(p.numel() for p in model.parameters())
    LOGGER.info("Total parameters: %s", f"{total_params:,}")

    # Training setup
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = weighted_mse_loss

    # MLflow setup
    run_name = setup_mlflow_experiment("from_xyY", f"multi_head_gamma_{GAMMA}")

    LOGGER.info("")
    LOGGER.info("MLflow run: %s", run_name)

    # Training loop state.
    best_val_loss = float("inf")
    patience_counter = 0

    LOGGER.info("")
    LOGGER.info("Starting training...")

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(
            {
                "model": "multi_head_gamma",
                "num_epochs": epochs,
                "batch_size": batch_size,
                "learning_rate": lr,
                "optimizer": "Adam",
                "criterion": "weighted_mse_loss",
                "patience": patience,
                "total_params": total_params,
                "gamma": GAMMA,
            }
        )

        for epoch in range(epochs):
            # Shared helpers perform the forward/backward and eval passes.
            train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
            val_loss = validate(model, val_loader, criterion, device)

            log_training_epoch(
                epoch, train_loss, val_loss, optimizer.param_groups[0]["lr"]
            )

            LOGGER.info(
                "Epoch %03d/%d - Train Loss: %.6f, Val Loss: %.6f",
                epoch + 1,
                epochs,
                train_loss,
                val_loss,
            )

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0

                # Checkpoint the best model to disk immediately rather than
                # keeping a state-dict copy in memory.
                # NOTE(review): mkdir without parents=True assumes
                # PROJECT_ROOT/models already exists; confirm.
                model_directory = PROJECT_ROOT / "models" / "from_xyY"
                model_directory.mkdir(exist_ok=True)
                checkpoint_file = model_directory / "multi_head_gamma_best.pth"

                torch.save(
                    {
                        "model_state_dict": model.state_dict(),
                        "input_params": input_params,
                        "output_params": output_params,
                        "epoch": epoch,
                        "val_loss": val_loss,
                    },
                    checkpoint_file,
                )

                LOGGER.info(" → Saved best model (val_loss: %.6f)", val_loss)
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    LOGGER.info("")
                    LOGGER.info("Early stopping after %d epochs", epoch + 1)
                    break

        mlflow.log_metrics(
            {
                "best_val_loss": best_val_loss,
                "final_epoch": epoch + 1,
            }
        )

        # Export to ONNX. ``checkpoint_file`` and ``model_directory`` are
        # bound on the first epoch (best_val_loss starts at inf), so they
        # are guaranteed to exist here.
        LOGGER.info("")
        LOGGER.info("Exporting model to ONNX...")
        model.eval()

        # Reload best weights (the in-memory model may be from a later,
        # worse epoch).
        checkpoint = torch.load(checkpoint_file)
        model.load_state_dict(checkpoint["model_state_dict"])

        # Dummy input on the same device as the model for tracing.
        dummy_input = torch.randn(1, 3).to(device)

        onnx_file = model_directory / "multi_head_gamma.onnx"
        torch.onnx.export(
            model,
            dummy_input,
            onnx_file,
            export_params=True,
            opset_version=15,
            input_names=["xyY_gamma"],
            output_names=["munsell_spec"],
            dynamic_axes={"xyY_gamma": {0: "batch_size"}, "munsell_spec": {0: "batch_size"}},
        )

        # Save normalization parameters (including gamma)
        params_file = model_directory / "multi_head_gamma_normalization_params.npz"
        np.savez(
            params_file,
            input_params=input_params,
            output_params=output_params,
        )

        LOGGER.info("ONNX model saved to: %s", onnx_file)
        LOGGER.info("Normalization parameters saved to: %s", params_file)
        LOGGER.info("IMPORTANT: Input Y must be gamma-corrected with gamma=%.2f", GAMMA)

        mlflow.log_artifact(str(checkpoint_file))
        mlflow.log_artifact(str(onnx_file))
        mlflow.log_artifact(str(params_file))
        mlflow.pytorch.log_model(model, "model")

        LOGGER.info("=" * 80)
295
+
296
+
297
+ if __name__ == "__main__":
298
+ logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)
299
+
300
+ main()
learning_munsell/training/from_xyY/train_multi_head_gamma_sweep.py ADDED
@@ -0,0 +1,605 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train multi-head ML models with various gamma values to find optimal gamma.
3
+
4
+ Sweeps gamma from 1.0 to 3.0 in increments of 0.1 and evaluates each model
5
+ on real Munsell colours using Delta-E CIE2000.
6
+
7
+ Supports parallel execution with multiple runs per gamma for averaging.
8
+ """
9
+
10
+ import logging
11
+ from concurrent.futures import ProcessPoolExecutor, as_completed
12
+ from typing import Any
13
+
14
+ import numpy as np
15
+ import torch
16
+ from colour import XYZ_to_Lab, xyY_to_XYZ
17
+ from colour.difference import delta_E_CIE2000
18
+ from colour.notation.datasets.munsell import MUNSELL_COLOURS_REAL
19
+ from colour.notation.munsell import (
20
+ CCS_ILLUMINANT_MUNSELL,
21
+ munsell_specification_to_xyY,
22
+ )
23
+ from numpy.typing import NDArray
24
+ from torch import nn, optim
25
+ from torch.utils.data import DataLoader, TensorDataset
26
+
27
+ from learning_munsell import PROJECT_ROOT
28
+ from learning_munsell.models.networks import MultiHeadMLPToMunsell
29
+ from learning_munsell.utilities.data import (
30
+ MUNSELL_NORMALIZATION_PARAMS,
31
+ normalize_munsell,
32
+ )
33
+
34
+ LOGGER = logging.getLogger(__name__)
35
+
36
+
37
def normalize_inputs(X: NDArray, gamma: float) -> tuple[NDArray, dict[str, Any]]:
    """
    Normalize xyY inputs to [0, 1] range with gamma correction on Y.

    Parameters
    ----------
    X : ndarray
        xyY values of shape (n, 3) where columns are [x, y, Y].
    gamma : float
        Gamma value to apply to Y component. Must be strictly positive
        (the exponent used is ``1 / gamma``).

    Returns
    -------
    ndarray
        Normalized values with gamma-corrected Y, dtype float32.
    dict
        Normalization parameters including gamma value.
    """
    x_range = (0.0, 1.0)
    y_range = (0.0, 1.0)
    Y_range = (0.0, 1.0)

    # Cast up front so the documented float32 contract actually holds:
    # previously the input dtype leaked through unchanged.  Callers feed
    # the result to torch.FloatTensor, so float32 is the natural dtype.
    X_norm = np.asarray(X, dtype=np.float32).copy()
    X_norm[:, 0] = (X_norm[:, 0] - x_range[0]) / (x_range[1] - x_range[0])
    X_norm[:, 1] = (X_norm[:, 1] - y_range[0]) / (y_range[1] - y_range[0])

    # Clip Y into [0, 1] before applying the power curve so that the
    # fractional exponent never sees a negative base.
    Y_normalized = (X_norm[:, 2] - Y_range[0]) / (Y_range[1] - Y_range[0])
    Y_normalized = np.clip(Y_normalized, 0, 1)
    X_norm[:, 2] = np.power(Y_normalized, 1.0 / gamma)

    params = {
        "x_range": x_range,
        "y_range": y_range,
        "Y_range": Y_range,
        "gamma": gamma,
    }

    return X_norm, params
75
+
76
+
77
def denormalize_output(y_norm: NDArray, params: dict[str, Any]) -> NDArray:
    """
    Denormalize Munsell output from [0, 1] to original ranges.

    Parameters
    ----------
    y_norm : ndarray
        Normalized Munsell values in [0, 1] range; last axis is
        [hue, value, chroma, code].
    params : dict
        Normalization parameters containing ``hue_range``, ``value_range``,
        ``chroma_range`` and ``code_range`` entries as (low, high) pairs.

    Returns
    -------
    ndarray
        Denormalized Munsell values in original ranges.
    """
    range_keys = ("hue_range", "value_range", "chroma_range", "code_range")

    result = np.copy(y_norm)
    # Rescale each component independently: norm * (high - low) + low.
    for channel, key in enumerate(range_keys):
        low, high = params[key]
        result[..., channel] = y_norm[..., channel] * (high - low) + low

    return result
111
+
112
+
113
+ def weighted_mse_loss(
114
+ pred: torch.Tensor, target: torch.Tensor, weights: torch.Tensor = None
115
+ ) -> torch.Tensor:
116
+ """
117
+ Component-wise weighted MSE loss.
118
+
119
+ Parameters
120
+ ----------
121
+ pred : Tensor
122
+ Predicted Munsell values.
123
+ target : Tensor
124
+ Ground truth Munsell values.
125
+ weights : Tensor, optional
126
+ Component weights [w_hue, w_value, w_chroma, w_code].
127
+
128
+ Returns
129
+ -------
130
+ Tensor
131
+ Weighted mean squared error loss.
132
+ """
133
+ if weights is None:
134
+ weights = torch.tensor([1.0, 1.0, 3.0, 0.5], device=pred.device)
135
+ mse = (pred - target) ** 2
136
+ weighted_mse = mse * weights
137
+ return weighted_mse.mean()
138
+
139
+
140
def clamp_munsell_specification(spec: NDArray) -> NDArray:
    """
    Clamp Munsell specification to valid ranges.

    Parameters
    ----------
    spec : ndarray
        Munsell specification [hue, value, chroma, code].

    Returns
    -------
    ndarray
        Clamped Munsell specification within valid ranges:
        hue in [0.5, 10], value in [1, 9], chroma in [0, 50],
        code in [1, 10].
    """
    # Per-component (low, high) bounds in [hue, value, chroma, code] order.
    bounds = ((0.5, 10.0), (1.0, 9.0), (0.0, 50.0), (1.0, 10.0))

    clamped = np.copy(spec)
    for channel, (low, high) in enumerate(bounds):
        clamped[..., channel] = np.clip(spec[..., channel], low, high)

    return clamped
160
+
161
+
162
def compute_delta_e(pred: NDArray, reference_Lab: NDArray) -> list[float]:
    """
    Compute Delta-E CIE2000 for predicted Munsell specifications.

    Parameters
    ----------
    pred : ndarray
        Predicted Munsell specifications.
    reference_Lab : ndarray
        Reference CIELAB values for comparison.

    Returns
    -------
    list of float
        Delta-E CIE2000 values for valid predictions.

    Notes
    -----
    Predictions that cannot be converted to valid xyY are skipped.
    """
    delta_E_values = []
    for idx, raw_spec in enumerate(pred):
        try:
            clamped = clamp_munsell_specification(raw_spec)
            conversion_spec = clamped.copy()
            # The hue-family code must be integral for the conversion.
            conversion_spec[3] = round(clamped[3])
            predicted_xyY = munsell_specification_to_xyY(conversion_spec)
            predicted_Lab = XYZ_to_Lab(
                xyY_to_XYZ(predicted_xyY), CCS_ILLUMINANT_MUNSELL
            )
            delta_E_values.append(delta_E_CIE2000(reference_Lab[idx], predicted_Lab))
        except (RuntimeError, ValueError):
            # Out-of-gamut specification: skip this sample.
            continue
    return delta_E_values
196
+
197
+
198
def train_model(
    gamma: float,
    X_train: NDArray,
    y_train: NDArray,
    X_val: NDArray,
    y_val: NDArray,
    device: torch.device,
    num_epochs: int = 100,
    patience: int = 15,
) -> tuple[nn.Module, dict[str, Any], dict[str, Any], float]:
    """
    Train a multi-head model with specified gamma value.

    Parameters
    ----------
    gamma : float
        Gamma value for Y correction.
    X_train : ndarray
        Training inputs (xyY values).
    y_train : ndarray
        Training targets (Munsell specifications).
    X_val : ndarray
        Validation inputs.
    y_val : ndarray
        Validation targets.
    device : torch.device
        Device to run training on.
    num_epochs : int, optional
        Maximum number of training epochs. Default is 100.
    patience : int, optional
        Early stopping patience. Default is 15.

    Returns
    -------
    nn.Module
        Trained model with best validation loss.
    dict
        Input normalization parameters.
    dict
        Output normalization parameters.
    float
        Best validation loss achieved.
    """
    # Normalize data
    X_train_norm, input_params = normalize_inputs(X_train, gamma=gamma)
    X_val_norm, _ = normalize_inputs(X_val, gamma=gamma)

    # Use shared normalization parameters covering the full Munsell space for generalization
    output_params = MUNSELL_NORMALIZATION_PARAMS
    y_train_norm = normalize_munsell(y_train, output_params)
    y_val_norm = normalize_munsell(y_val, output_params)

    # Convert to tensors
    X_train_t = torch.FloatTensor(X_train_norm)
    y_train_t = torch.FloatTensor(y_train_norm)
    X_val_t = torch.FloatTensor(X_val_norm)
    y_val_t = torch.FloatTensor(y_val_norm)

    # Create data loaders
    train_dataset = TensorDataset(X_train_t, y_train_t)
    val_dataset = TensorDataset(X_val_t, y_val_t)

    train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1024, shuffle=False)

    # Initialize model
    model = MultiHeadMLPToMunsell().to(device)
    optimizer = optim.Adam(model.parameters(), lr=5e-4)
    criterion = weighted_mse_loss

    best_val_loss = float("inf")
    patience_counter = 0
    best_state = None

    for epoch in range(num_epochs):
        # Train
        model.train()
        for X_batch, y_batch in train_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Validate
        model.eval()
        total_val_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch = X_batch.to(device)
                y_batch = y_batch.to(device)
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                total_val_loss += loss.item()
        val_loss = total_val_loss / len(val_loader)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            # BUG FIX: `model.state_dict().copy()` is a *shallow* dict copy;
            # the parameter tensors inside keep being mutated in place by
            # optimizer.step(), so the "best" snapshot silently tracked the
            # latest weights.  Clone each tensor to freeze the snapshot.
            best_state = {
                key: value.detach().clone()
                for key, value in model.state_dict().items()
            }
        else:
            patience_counter += 1
            if patience_counter >= patience:
                break

    # Load best state
    if best_state is not None:
        model.load_state_dict(best_state)

    return model, input_params, output_params, best_val_loss
312
+
313
+
314
def evaluate_on_real_munsell(
    model: nn.Module,
    input_params: dict[str, Any],
    output_params: dict[str, Any],
    xyY_array: NDArray,
    reference_Lab: NDArray,
    device: torch.device,
) -> tuple[float, float]:
    """
    Evaluate model on real Munsell colors using Delta-E CIE2000.

    Parameters
    ----------
    model : nn.Module
        Trained model to evaluate.
    input_params : dict
        Input normalization parameters (must contain ``gamma``).
    output_params : dict
        Output normalization parameters.
    xyY_array : ndarray
        Real Munsell xyY values.
    reference_Lab : ndarray
        Reference CIELAB values for Delta-E computation.
    device : torch.device
        Device to run evaluation on.

    Returns
    -------
    float
        Mean Delta-E CIE2000.
    float
        Median Delta-E CIE2000.
    """
    model.eval()

    # Re-apply exactly the gamma-corrected normalization used at training time.
    normalized_inputs, _ = normalize_inputs(xyY_array, gamma=input_params["gamma"])
    input_tensor = torch.FloatTensor(normalized_inputs).to(device)

    with torch.no_grad():
        predictions_normalized = model(input_tensor).cpu().numpy()

    predictions = denormalize_output(predictions_normalized, output_params)
    delta_E_values = compute_delta_e(predictions, reference_Lab)

    return np.mean(delta_E_values), np.median(delta_E_values)
362
+
363
+
364
def run_single_trial(
    gamma: float,
    run_id: int,
    X_train: NDArray,
    y_train: NDArray,
    X_val: NDArray,
    y_val: NDArray,
    xyY_array: NDArray,
    reference_Lab: NDArray,
) -> dict[str, Any]:
    """
    Run a single training trial for a given gamma value.

    Parameters
    ----------
    gamma : float
        Gamma value for Y correction.
    run_id : int
        Run identifier for this trial.
    X_train : ndarray
        Training inputs.
    y_train : ndarray
        Training targets.
    X_val : ndarray
        Validation inputs.
    y_val : ndarray
        Validation targets.
    xyY_array : ndarray
        Real Munsell xyY values for evaluation.
    reference_Lab : ndarray
        Reference CIELAB values for Delta-E computation.

    Returns
    -------
    dict
        Results dictionary containing gamma, run_id, val_loss,
        mean_delta_e, and median_delta_e.

    Notes
    -----
    Uses CPU to avoid MPS multiprocessing issues.
    """
    # Worker processes stay on CPU: GPU/MPS contexts do not share cleanly
    # across forked processes.
    device = torch.device("cpu")

    trained_model, input_params, output_params, val_loss = train_model(
        gamma=gamma,
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
        device=device,
        num_epochs=100,
        patience=15,
    )

    mean_delta_e, median_delta_e = evaluate_on_real_munsell(
        trained_model, input_params, output_params, xyY_array, reference_Lab, device
    )

    return dict(
        gamma=gamma,
        run_id=run_id,
        val_loss=val_loss,
        mean_delta_e=mean_delta_e,
        median_delta_e=median_delta_e,
    )
431
+
432
+
433
def main() -> None:
    """
    Run gamma sweep experiment to find optimal gamma value.

    Notes
    -----
    The training pipeline:
    1. Loads training and validation data from cache
    2. Loads real Munsell colors for evaluation
    3. Sweeps gamma values from 1.0 to 3.0 in 0.1 increments
    4. Trains multiple models per gamma value for averaging
    5. Evaluates each model on real Munsell colors using Delta-E CIE2000
    6. Aggregates results and identifies best gamma value
    7. Saves results to NPZ file for analysis

    Uses parallel execution with ProcessPoolExecutor for efficiency.
    Each model is trained with early stopping and evaluated on validation set.
    """
    # Local import: argparse is only needed when run as a script.
    import argparse

    parser = argparse.ArgumentParser(description="Gamma sweep with averaging")
    parser.add_argument("--runs", type=int, default=3, help="Number of runs per gamma")
    parser.add_argument("--workers", type=int, default=4, help="Number of parallel workers")
    args = parser.parse_args()

    num_runs = args.runs
    num_workers = args.workers

    LOGGER.info("=" * 80)
    LOGGER.info("Multi-Head Gamma Sweep: Finding Optimal Gamma Value")
    LOGGER.info("Testing gamma values from 1.0 to 3.0 in increments of 0.1")
    LOGGER.info("Runs per gamma: %d, Parallel workers: %d", num_runs, num_workers)
    LOGGER.info("=" * 80)

    # Load training data
    data_dir = PROJECT_ROOT / "data"
    cache_file = data_dir / "training_data.npz"

    if not cache_file.exists():
        LOGGER.error("Error: Training data not found at %s", cache_file)
        return

    LOGGER.info("\nLoading training data...")
    data = np.load(cache_file)
    X_train = data["X_train"]
    y_train = data["y_train"]
    X_val = data["X_val"]
    y_val = data["y_val"]
    LOGGER.info("Train samples: %d, Validation samples: %d", len(X_train), len(X_val))

    # Load real Munsell data for evaluation: precompute reference Lab values
    # once so each worker only has to evaluate its predictions.
    LOGGER.info("Loading real Munsell colours for evaluation...")
    xyY_values = []
    reference_Lab = []

    for munsell_spec_tuple, xyY in MUNSELL_COLOURS_REAL:
        try:
            # Dataset stores Y in [0, 100]; rescale to [0, 1].
            xyY_scaled = np.array([xyY[0], xyY[1], xyY[2] / 100.0])
            XYZ = xyY_to_XYZ(xyY_scaled)
            Lab = XYZ_to_Lab(XYZ, CCS_ILLUMINANT_MUNSELL)
            xyY_values.append(xyY_scaled)
            reference_Lab.append(Lab)
        except (RuntimeError, ValueError):
            # Skip entries that fail colour-space conversion.
            continue

    xyY_array = np.array(xyY_values)
    reference_Lab = np.array(reference_Lab)
    LOGGER.info("Loaded %d real Munsell colours", len(xyY_array))

    # Gamma values to test
    gamma_values = [round(1.0 + i * 0.1, 1) for i in range(21)]  # 1.0 to 3.0

    # Create all tasks: (gamma, run_id) pairs
    tasks = [(gamma, run_id) for gamma in gamma_values for run_id in range(num_runs)]
    total_tasks = len(tasks)

    LOGGER.info("\n" + "-" * 80)
    LOGGER.info("Starting gamma sweep: %d total tasks (%d gamma values x %d runs)",
                total_tasks, len(gamma_values), num_runs)
    LOGGER.info("-" * 80)

    all_results = []
    completed = 0

    # Fan out one process per (gamma, run) trial; the futures dict maps each
    # future back to its (gamma, run_id) pair for progress reporting.
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = {
            executor.submit(
                run_single_trial, gamma, run_id,
                X_train, y_train, X_val, y_val, xyY_array, reference_Lab
            ): (gamma, run_id)
            for gamma, run_id in tasks
        }

        for future in as_completed(futures):
            gamma, run_id = futures[future]
            try:
                result = future.result()
                all_results.append(result)
                completed += 1
                LOGGER.info(
                    "[%3d/%3d] gamma=%.1f run=%d: mean_ΔE=%.4f, median_ΔE=%.4f",
                    completed, total_tasks, gamma, run_id,
                    result["mean_delta_e"], result["median_delta_e"]
                )
            except Exception as e:
                # A failed trial is logged and excluded from aggregation;
                # the sweep continues with the remaining trials.
                LOGGER.error("Task failed for gamma=%.1f run=%d: %s", gamma, run_id, e)
                completed += 1

    # Aggregate results by gamma (average across runs)
    aggregated = {}
    for r in all_results:
        gamma = r["gamma"]
        if gamma not in aggregated:
            aggregated[gamma] = {"val_losses": [], "means": [], "medians": []}
        aggregated[gamma]["val_losses"].append(r["val_loss"])
        aggregated[gamma]["means"].append(r["mean_delta_e"])
        aggregated[gamma]["medians"].append(r["median_delta_e"])

    results = []
    for gamma in sorted(aggregated.keys()):
        agg = aggregated[gamma]
        results.append({
            "gamma": gamma,
            "val_loss": np.mean(agg["val_losses"]),
            "val_loss_std": np.std(agg["val_losses"]),
            "mean_delta_e": np.mean(agg["means"]),
            "mean_delta_e_std": np.std(agg["means"]),
            "median_delta_e": np.mean(agg["medians"]),
            "median_delta_e_std": np.std(agg["medians"]),
            "num_runs": len(agg["means"]),
        })

    # Print results
    LOGGER.info("\n" + "=" * 80)
    LOGGER.info("GAMMA SWEEP RESULTS (averaged over %d runs)", num_runs)
    LOGGER.info("=" * 80)
    LOGGER.info("")
    LOGGER.info("%-8s %-14s %-14s %-14s", "Gamma", "Val Loss", "Mean ΔE", "Median ΔE")
    LOGGER.info("-" * 50)

    for r in results:
        LOGGER.info(
            "%-8.1f %-14s %-14s %-14s",
            r["gamma"],
            f"{r['val_loss']:.6f}±{r['val_loss_std']:.4f}",
            f"{r['mean_delta_e']:.4f}±{r['mean_delta_e_std']:.4f}",
            f"{r['median_delta_e']:.4f}±{r['median_delta_e_std']:.4f}",
        )

    # Find best by mean Delta-E
    best_by_mean = min(results, key=lambda x: x["mean_delta_e"])
    best_by_median = min(results, key=lambda x: x["median_delta_e"])

    LOGGER.info("")
    LOGGER.info("Best gamma by MEAN Delta-E: %.1f (ΔE = %.4f ± %.4f)",
                best_by_mean["gamma"], best_by_mean["mean_delta_e"],
                best_by_mean["mean_delta_e_std"])
    LOGGER.info("Best gamma by MEDIAN Delta-E: %.1f (ΔE = %.4f ± %.4f)",
                best_by_median["gamma"], best_by_median["median_delta_e"],
                best_by_median["median_delta_e_std"])

    # Save results
    # NOTE(review): the models/from_xyY directory is not created here —
    # confirm a training script has already created it before this runs.
    # The saved arrays hold Python dicts (object dtype), so loading them
    # back requires np.load(..., allow_pickle=True).
    results_file = PROJECT_ROOT / "models" / "from_xyY" / "gamma_sweep_results_averaged.npz"
    np.savez(results_file, results=results, all_results=all_results)
    LOGGER.info("\nResults saved to: %s", results_file)

    LOGGER.info("\n" + "=" * 80)
601
+
602
if __name__ == "__main__":
    # Bare "%(message)s" format keeps the result tables aligned; force=True
    # removes any handlers already attached by imported libraries.
    logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)

    main()
learning_munsell/training/from_xyY/train_multi_head_large.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train multi-head ML model on large dataset (2M samples) for xyY to Munsell conversion.
3
+
4
+ This script trains on the larger dataset for potentially improved accuracy.
5
+ Uses the same architecture as train_multi_head_mlp.py but with the large dataset.
6
+ """
7
+
8
+ import logging
9
+
10
+ import click
11
+ import mlflow
12
+ import mlflow.pytorch
13
+ import numpy as np
14
+ import torch
15
+ from numpy.typing import NDArray
16
+ from torch import nn, optim
17
+ from torch.utils.data import DataLoader, TensorDataset
18
+
19
+ from learning_munsell import PROJECT_ROOT
20
+ from learning_munsell.models.networks import MultiHeadMLPToMunsell
21
+ from learning_munsell.utilities.common import log_training_epoch, setup_mlflow_experiment
22
+ from learning_munsell.utilities.data import (
23
+ MUNSELL_NORMALIZATION_PARAMS,
24
+ XYY_NORMALIZATION_PARAMS,
25
+ normalize_munsell,
26
+ )
27
+ from learning_munsell.utilities.losses import weighted_mse_loss
28
+ from learning_munsell.utilities.training import train_epoch, validate
29
+
30
+ LOGGER = logging.getLogger(__name__)
31
+
32
+
33
@click.command()
@click.option("--epochs", default=300, help="Number of training epochs")
@click.option("--batch-size", default=2048, help="Batch size for training")
@click.option("--lr", default=5e-4, help="Learning rate")
@click.option("--patience", default=30, help="Early stopping patience")
def main(epochs: int, batch_size: int, lr: float, patience: int) -> None:
    """
    Train multi-head MLP on large dataset (2M samples) for xyY to Munsell.

    Notes
    -----
    The training pipeline:
    1. Loads training and validation data from large cached .npz file
    2. Normalizes xyY inputs (already [0,1]) and Munsell outputs to [0,1]
    3. Creates multi-head MLP with shared encoder and component-specific heads
    4. Trains with weighted MSE loss (emphasizing chroma)
    5. Uses Adam optimizer with ReduceLROnPlateau scheduler
    6. Applies early stopping based on validation loss (patience=30)
    7. Exports best model to ONNX format
    8. Logs metrics and artifacts to MLflow
    """

    LOGGER.info("=" * 80)
    LOGGER.info("Multi-Head Model Training on Large Dataset (2M samples)")
    LOGGER.info("=" * 80)

    # Device priority: MPS (Apple) overrides CUDA/CPU when available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if torch.backends.mps.is_available():
        device = torch.device("mps")
    LOGGER.info("Using device: %s", device)

    # Load large training data
    data_dir = PROJECT_ROOT / "data"
    cache_file = data_dir / "training_data_large.npz"

    if not cache_file.exists():
        LOGGER.error("Error: Large training data not found at %s", cache_file)
        LOGGER.error("Please run generate_large_training_data.py first")
        return

    LOGGER.info("Loading large training data from %s...", cache_file)
    data = np.load(cache_file)

    X_train = data["X_train"]
    y_train = data["y_train"]
    X_val = data["X_val"]
    y_val = data["y_val"]

    LOGGER.info("Train samples: %d", len(X_train))
    LOGGER.info("Validation samples: %d", len(X_val))

    # Normalize outputs (xyY inputs are already in [0, 1] range)
    # Use shared normalization parameters covering the full Munsell space for generalization
    output_params = MUNSELL_NORMALIZATION_PARAMS
    y_train_norm = normalize_munsell(y_train, output_params)
    y_val_norm = normalize_munsell(y_val, output_params)

    # Convert to PyTorch tensors
    X_train_t = torch.FloatTensor(X_train)
    y_train_t = torch.FloatTensor(y_train_norm)
    X_val_t = torch.FloatTensor(X_val)
    y_val_t = torch.FloatTensor(y_val_norm)

    # Create data loaders (larger batch size for larger dataset)
    train_dataset = TensorDataset(X_train_t, y_train_t)
    val_dataset = TensorDataset(X_val_t, y_val_t)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model
    model = MultiHeadMLPToMunsell().to(device)
    LOGGER.info("")
    LOGGER.info("Model architecture:")
    LOGGER.info("%s", model)

    total_params = sum(p.numel() for p in model.parameters())
    LOGGER.info("Total parameters: %s", f"{total_params:,}")

    # Training setup: halve the LR when validation loss plateaus for 10 epochs.
    learning_rate = lr
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.5, patience=10
    )
    criterion = weighted_mse_loss

    # MLflow setup
    run_name = setup_mlflow_experiment("from_xyY", "multi_head_large")

    LOGGER.info("")
    LOGGER.info("MLflow run: %s", run_name)

    # Training loop
    best_val_loss = float("inf")
    patience_counter = 0

    LOGGER.info("")
    LOGGER.info("Starting training...")

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(
            {
                "model": "multi_head_large",
                "learning_rate": learning_rate,
                "batch_size": batch_size,
                "num_epochs": epochs,
                "patience": patience,
                "total_params": total_params,
                "train_samples": len(X_train),
                "val_samples": len(X_val),
                "dataset": "large_2M",
            }
        )

        for epoch in range(epochs):
            train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
            val_loss = validate(model, val_loader, criterion, device)

            scheduler.step(val_loss)

            log_training_epoch(
                epoch, train_loss, val_loss, optimizer.param_groups[0]["lr"]
            )

            LOGGER.info(
                "Epoch %03d/%d - Train Loss: %.6f, Val Loss: %.6f, LR: %.6f",
                epoch + 1,
                epochs,
                train_loss,
                val_loss,
                optimizer.param_groups[0]["lr"],
            )

            if val_loss < best_val_loss:
                # Checkpoint every improvement so the best weights survive
                # an interrupted run.
                best_val_loss = val_loss
                patience_counter = 0

                model_directory = PROJECT_ROOT / "models" / "from_xyY"
                model_directory.mkdir(exist_ok=True)
                checkpoint_file = model_directory / "multi_head_large_best.pth"

                torch.save(
                    {
                        "model_state_dict": model.state_dict(),
                        "output_params": output_params,
                        "epoch": epoch,
                        "val_loss": val_loss,
                    },
                    checkpoint_file,
                )

                LOGGER.info(" → Saved best model (val_loss: %.6f)", val_loss)
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    LOGGER.info("")
                    LOGGER.info("Early stopping after %d epochs", epoch + 1)
                    break

        mlflow.log_metrics(
            {
                "best_val_loss": best_val_loss,
                "final_epoch": epoch + 1,
            }
        )

        # Export to ONNX: reload the best checkpoint first so the exported
        # graph carries the best weights, not the last epoch's.
        LOGGER.info("")
        LOGGER.info("Exporting model to ONNX...")
        model.eval()

        # weights_only=False: the checkpoint also stores output_params (a dict).
        checkpoint = torch.load(checkpoint_file, weights_only=False)
        model.load_state_dict(checkpoint["model_state_dict"])

        dummy_input = torch.randn(1, 3).to(device)

        model_directory = PROJECT_ROOT / "models" / "from_xyY"
        onnx_file = model_directory / "multi_head_large.onnx"
        torch.onnx.export(
            model,
            dummy_input,
            onnx_file,
            export_params=True,
            opset_version=15,
            input_names=["xyY"],
            output_names=["munsell_spec"],
            dynamic_axes={"xyY": {0: "batch_size"}, "munsell_spec": {0: "batch_size"}},
        )

        # Persist both normalization parameter sets next to the ONNX model so
        # inference code can reproduce the training-time scaling.
        params_file = model_directory / "multi_head_large_normalization_params.npz"
        input_params = XYY_NORMALIZATION_PARAMS
        np.savez(
            params_file,
            input_params=input_params,
            output_params=output_params,
        )

        mlflow.log_artifact(str(checkpoint_file))
        mlflow.log_artifact(str(onnx_file))
        mlflow.log_artifact(str(params_file))
        mlflow.pytorch.log_model(model, "model")

        LOGGER.info("ONNX model saved to: %s", onnx_file)
        LOGGER.info("Normalization parameters saved to: %s", params_file)
        LOGGER.info("Artifacts logged to MLflow")

    LOGGER.info("=" * 80)
+
242
+
243
+ if __name__ == "__main__":
244
+ logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)
245
+
246
+ main()
learning_munsell/training/from_xyY/train_multi_head_mlp.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train multi-head ML model for xyY to Munsell conversion.
3
+
4
+ Architecture:
5
+ - Shared encoder: 3 inputs → 512-dim features
6
+ - 4 separate heads (one per component):
7
+ - Hue head (circular/angular)
8
+ - Value head (linear lightness)
9
+ - Chroma head (non-linear saturation - larger capacity)
10
+ - Code head (discrete categorical)
11
+
12
+ This architecture allows each component to learn specialized features
13
+ while sharing the general color space understanding.
14
+ """
15
+
16
+ import logging
17
+ import click
18
+ import mlflow
19
+ import mlflow.pytorch
20
+ import numpy as np
21
+ import torch
22
+ from numpy.typing import NDArray
23
+ from torch import nn, optim
24
+ from torch.utils.data import DataLoader, TensorDataset
25
+
26
+ from learning_munsell import PROJECT_ROOT
27
+ from learning_munsell.models.networks import MultiHeadMLPToMunsell
28
+ from learning_munsell.utilities.common import log_training_epoch, setup_mlflow_experiment
29
+ from learning_munsell.utilities.data import (
30
+ MUNSELL_NORMALIZATION_PARAMS,
31
+ XYY_NORMALIZATION_PARAMS,
32
+ normalize_munsell,
33
+ )
34
+ from learning_munsell.utilities.losses import weighted_mse_loss
35
+ from learning_munsell.utilities.training import train_epoch, validate
36
+
37
+ LOGGER = logging.getLogger(__name__)
38
+
39
+
40
@click.command()
@click.option("--epochs", default=200, help="Number of training epochs")
@click.option("--batch-size", default=1024, help="Batch size for training")
@click.option("--lr", default=5e-4, help="Learning rate")
@click.option("--patience", default=20, help="Early stopping patience")
def main(epochs: int, batch_size: int, lr: float, patience: int) -> None:
    """
    Train multi-head MLP for xyY to Munsell conversion.

    Parameters
    ----------
    epochs : int
        Maximum number of training epochs.
    batch_size : int
        Mini-batch size for both training and validation loaders.
    lr : float
        Adam learning rate.
    patience : int
        Epochs without validation improvement before early stopping.

    Notes
    -----
    The training pipeline:
    1. Loads training and validation data from cached .npz file
    2. Normalizes xyY inputs (already [0,1]) and Munsell outputs to [0,1]
    3. Creates multi-head MLP with shared encoder and component-specific heads
    4. Trains with weighted MSE loss (emphasizing chroma)
    5. Uses Adam optimizer with no learning rate scheduling
    6. Applies early stopping based on validation loss (patience=20)
    7. Exports best model to ONNX format
    8. Logs metrics and artifacts to MLflow
    """
    LOGGER.info("=" * 80)
    LOGGER.info("ML-Based xyY to Munsell Conversion: Multi-Head Model Training")
    LOGGER.info("=" * 80)

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    LOGGER.info("Using device: %s", device)

    # Load training data
    data_dir = PROJECT_ROOT / "data"
    cache_file = data_dir / "training_data.npz"

    if not cache_file.exists():
        LOGGER.error("Error: Training data not found at %s", cache_file)
        LOGGER.error("Please run 01_generate_training_data.py first")
        return

    LOGGER.info("Loading training data from %s...", cache_file)
    data = np.load(cache_file)

    X_train = data["X_train"]
    y_train = data["y_train"]
    X_val = data["X_val"]
    y_val = data["y_val"]

    LOGGER.info("Train samples: %d", len(X_train))
    LOGGER.info("Validation samples: %d", len(X_val))

    # Normalize outputs (xyY inputs are already in [0, 1] range).
    # Use shared normalization parameters covering the full Munsell space
    # for generalization.
    output_params = MUNSELL_NORMALIZATION_PARAMS
    y_train_norm = normalize_munsell(y_train, output_params)
    y_val_norm = normalize_munsell(y_val, output_params)

    # Convert to PyTorch tensors
    X_train_t = torch.FloatTensor(X_train)
    y_train_t = torch.FloatTensor(y_train_norm)
    X_val_t = torch.FloatTensor(X_val)
    y_val_t = torch.FloatTensor(y_val_norm)

    # Create data loaders
    train_dataset = TensorDataset(X_train_t, y_train_t)
    val_dataset = TensorDataset(X_val_t, y_val_t)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model
    model = MultiHeadMLPToMunsell().to(device)
    LOGGER.info("")
    LOGGER.info("Model architecture:")
    LOGGER.info("%s", model)

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    LOGGER.info("Total parameters: %s", f"{total_params:,}")

    # Count parameters per component
    encoder_params = sum(p.numel() for p in model.encoder.parameters())
    hue_params = sum(p.numel() for p in model.hue_head.parameters())
    value_params = sum(p.numel() for p in model.value_head.parameters())
    chroma_params = sum(p.numel() for p in model.chroma_head.parameters())
    code_params = sum(p.numel() for p in model.code_head.parameters())

    LOGGER.info(" - Shared encoder: %s", f"{encoder_params:,}")
    LOGGER.info(" - Hue head: %s", f"{hue_params:,}")
    LOGGER.info(" - Value head: %s", f"{value_params:,}")
    LOGGER.info(" - Chroma head: %s (WIDER)", f"{chroma_params:,}")
    LOGGER.info(" - Code head: %s", f"{code_params:,}")

    # Training setup
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Per-component loss weights (hue, value, chroma, code) emphasizing chroma.
    # FIX: create the tensor on the compute device — a CPU-resident weight
    # tensor would fail against CUDA predictions inside weighted_mse_loss.
    weights = torch.tensor([1.0, 1.0, 3.0, 0.5], device=device)

    def criterion(pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        """Weighted MSE with the component weights above (PEP 8: def over lambda)."""
        return weighted_mse_loss(pred, target, weights)

    # MLflow setup
    run_name = setup_mlflow_experiment("from_xyY", "multi_head")

    LOGGER.info("")
    LOGGER.info("MLflow run: %s", run_name)

    # Training loop state
    best_val_loss = float("inf")
    patience_counter = 0
    final_epoch = 0  # tracked explicitly so logging is safe even when epochs == 0

    # Checkpoint location is fixed up-front so the export step below never
    # depends on names first bound inside the training loop.
    # parents=True also creates the intermediate "models" directory.
    model_directory = PROJECT_ROOT / "models" / "from_xyY"
    model_directory.mkdir(parents=True, exist_ok=True)
    checkpoint_file = model_directory / "multi_head_best.pth"

    LOGGER.info("")
    LOGGER.info("Starting training...")

    with mlflow.start_run(run_name=run_name):
        # Log parameters
        mlflow.log_params(
            {
                "model": "multi_head",
                "learning_rate": lr,
                "batch_size": batch_size,
                "num_epochs": epochs,
                "patience": patience,
                "total_params": total_params,
            }
        )

        for epoch in range(epochs):
            train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
            val_loss = validate(model, val_loader, criterion, device)
            final_epoch = epoch + 1

            # Log to MLflow
            log_training_epoch(
                epoch, train_loss, val_loss, optimizer.param_groups[0]["lr"]
            )

            LOGGER.info(
                "Epoch %03d/%d - Train Loss: %.6f, Val Loss: %.6f",
                epoch + 1,
                epochs,
                train_loss,
                val_loss,
            )

            # Early stopping bookkeeping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0

                # Save best model (normalization params travel with the weights)
                torch.save(
                    {
                        "model_state_dict": model.state_dict(),
                        "output_params": output_params,
                        "epoch": epoch,
                        "val_loss": val_loss,
                    },
                    checkpoint_file,
                )

                LOGGER.info(" → Saved best model (val_loss: %.6f)", val_loss)
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    LOGGER.info("")
                    LOGGER.info("Early stopping after %d epochs", epoch + 1)
                    break

        # Log final metrics
        mlflow.log_metrics(
            {
                "best_val_loss": best_val_loss,
                "final_epoch": final_epoch,
            }
        )

        # Export to ONNX
        LOGGER.info("")
        LOGGER.info("Exporting model to ONNX...")
        model.eval()

        # Reload best weights. weights_only=False is required because the
        # checkpoint also stores the normalization-parameter dict; this matches
        # the sibling large-model script and PyTorch >= 2.6, where the default
        # flipped to weights_only=True.
        checkpoint = torch.load(checkpoint_file, weights_only=False)
        model.load_state_dict(checkpoint["model_state_dict"])

        # Create dummy input (batch of one xyY triplet)
        dummy_input = torch.randn(1, 3).to(device)

        # Export
        onnx_file = model_directory / "multi_head.onnx"
        torch.onnx.export(
            model,
            dummy_input,
            onnx_file,
            export_params=True,
            opset_version=15,
            input_names=["xyY"],
            output_names=["munsell_spec"],
            dynamic_axes={"xyY": {0: "batch_size"}, "munsell_spec": {0: "batch_size"}},
        )

        # Save normalization parameters alongside model
        params_file = model_directory / "multi_head_normalization_params.npz"
        input_params = XYY_NORMALIZATION_PARAMS
        np.savez(
            params_file,
            input_params=input_params,
            output_params=output_params,
        )

        # Log artifacts to MLflow
        mlflow.log_artifact(str(checkpoint_file))
        mlflow.log_artifact(str(onnx_file))
        mlflow.log_artifact(str(params_file))
        mlflow.pytorch.log_model(model, "model")

        LOGGER.info("ONNX model saved to: %s", onnx_file)
        LOGGER.info("Normalization parameters saved to: %s", params_file)
        LOGGER.info("Artifacts logged to MLflow")

    LOGGER.info("=" * 80)
264
+
265
+
266
if __name__ == "__main__":
    # Message-only console output; force=True replaces any handlers that
    # imported libraries may already have installed on the root logger.
    logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)

    main()
learning_munsell/training/from_xyY/train_multi_head_multi_error_predictor.py ADDED
@@ -0,0 +1,378 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train Multi-Head error predictor for Multi-Head base model.
3
+
4
+ Architecture:
5
+ - 4 independent error correction branches (one per component)
6
+ - Each branch: 7 inputs (xyY + base_pred) → encoder → decoder → 1 error output
7
+ - Chroma branch: WIDER (1.5x capacity for hardest component)
8
+
9
+ Complete independence matches the Multi-Head base model philosophy.
10
+ """
11
+
12
+ import logging
13
+ from pathlib import Path
14
+ from typing import Any
15
+
16
+ import click
17
+ import mlflow
18
+ import mlflow.pytorch
19
+ import numpy as np
20
+ import onnxruntime as ort
21
+ import torch
22
+ from numpy.typing import NDArray
23
+ from torch import nn, optim
24
+ from torch.utils.data import DataLoader, TensorDataset
25
+
26
+ from learning_munsell import PROJECT_ROOT
27
+ from learning_munsell.models.networks import (
28
+ ComponentErrorPredictor,
29
+ MultiHeadErrorPredictorToMunsell,
30
+ )
31
+ from learning_munsell.utilities.common import log_training_epoch, setup_mlflow_experiment
32
+ from learning_munsell.utilities.data import normalize_munsell, normalize_xyY
33
+ from learning_munsell.utilities.losses import precision_focused_loss
34
+ from learning_munsell.utilities.training import train_epoch, validate
35
+
36
+ LOGGER = logging.getLogger(__name__)
37
+
38
+
39
+ def load_base_model(
40
+ model_path: Path, params_path: Path
41
+ ) -> tuple[ort.InferenceSession, dict, dict]:
42
+ """
43
+ Load Multi-Head base ONNX model and normalization parameters.
44
+
45
+ Parameters
46
+ ----------
47
+ model_path : Path
48
+ Path to Multi-Head base model ONNX file.
49
+ params_path : Path
50
+ Path to normalization parameters .npz file.
51
+
52
+ Returns
53
+ -------
54
+ session : ort.InferenceSession
55
+ ONNX Runtime inference session.
56
+ input_params : dict
57
+ Input normalization ranges.
58
+ output_params : dict
59
+ Output normalization ranges.
60
+ """
61
+ session = ort.InferenceSession(str(model_path))
62
+ params = np.load(params_path, allow_pickle=True)
63
+ return session, params["input_params"].item(), params["output_params"].item()
64
+
65
+
66
@click.command()
@click.option(
    "--base-model",
    type=click.Path(exists=True, path_type=Path),
    default=None,
    help="Path to Multi-Head base model ONNX file",
)
@click.option(
    "--params",
    type=click.Path(exists=True, path_type=Path),
    default=None,
    help="Path to normalization params file",
)
@click.option(
    "--epochs",
    type=int,
    default=200,
    help="Number of training epochs",
)
@click.option(
    "--batch-size",
    type=int,
    default=1024,
    help="Batch size for training",
)
@click.option(
    "--lr",
    type=float,
    default=3e-4,
    help="Learning rate",
)
@click.option(
    "--patience",
    type=int,
    default=20,
    help="Patience for early stopping",
)
def main(
    base_model: Path | None,
    params: Path | None,
    epochs: int,
    batch_size: int,
    lr: float,
    patience: int,
) -> None:
    """
    Train Multi-Head error predictor with 4 independent branches.

    Parameters
    ----------
    base_model : Path or None
        Path to Multi-Head base model ONNX file. Uses default if None.
    params : Path or None
        Path to normalization parameters. Uses default if None.
    epochs : int
        Maximum number of training epochs.
    batch_size : int
        Mini-batch size for both loaders.
    lr : float
        AdamW learning rate.
    patience : int
        Epochs without validation improvement before early stopping.

    Notes
    -----
    The training pipeline:
    1. Loads pre-trained base model
    2. Generates base model predictions for training data
    3. Computes residual errors between predictions and targets
    4. Trains error predictor on these residuals
    5. Uses precision-focused loss function
    6. Learning rate scheduling with ReduceLROnPlateau
    7. Early stopping based on validation loss
    8. Exports model to ONNX format
    9. Logs metrics and artifacts to MLflow
    """
    LOGGER.info("=" * 80)
    LOGGER.info("Multi-Head Error Predictor: 4 Independent Branches")
    LOGGER.info("=" * 80)

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    LOGGER.info("Using device: %s", device)

    # Paths
    model_directory = PROJECT_ROOT / "models" / "from_xyY"
    data_dir = PROJECT_ROOT / "data"

    # Use provided paths or defaults
    if base_model is None:
        base_model = model_directory / "multi_head.onnx"
    if params is None:
        params = model_directory / "multi_head_normalization_params.npz"

    cache_file = data_dir / "training_data.npz"

    # Load base model
    LOGGER.info("")
    LOGGER.info("Loading Multi-Head base model from %s...", base_model)
    base_session, input_params, output_params = load_base_model(base_model, params)

    # Load training data
    LOGGER.info("Loading training data from %s...", cache_file)
    data = np.load(cache_file)
    X_train = data["X_train"]
    y_train = data["y_train"]
    X_val = data["X_val"]
    y_val = data["y_val"]

    LOGGER.info("Train samples: %d", len(X_train))
    LOGGER.info("Validation samples: %d", len(X_val))

    # Generate base model predictions
    LOGGER.info("")
    LOGGER.info("Generating Multi-Head base model predictions...")
    X_train_norm = normalize_xyY(X_train, input_params)
    y_train_norm = normalize_munsell(y_train, output_params)

    # Base predictions (normalized)
    base_pred_train_norm = base_session.run(None, {"xyY": X_train_norm})[0]

    X_val_norm = normalize_xyY(X_val, input_params)
    y_val_norm = normalize_munsell(y_val, output_params)
    base_pred_val_norm = base_session.run(None, {"xyY": X_val_norm})[0]

    # Compute errors (in normalized space) — the residuals the predictor learns
    error_train = y_train_norm - base_pred_train_norm
    error_val = y_val_norm - base_pred_val_norm

    # Statistics
    LOGGER.info("")
    LOGGER.info("Multi-Head base model error statistics (normalized space):")
    LOGGER.info(" Mean absolute error: %.6f", np.mean(np.abs(error_train)))
    LOGGER.info(" Std of error: %.6f", np.std(error_train))
    LOGGER.info(" Max absolute error: %.6f", np.max(np.abs(error_train)))

    # Create combined input: [xyY_norm, base_prediction_norm]
    X_train_combined = np.concatenate([X_train_norm, base_pred_train_norm], axis=1)
    X_val_combined = np.concatenate([X_val_norm, base_pred_val_norm], axis=1)

    # Convert to PyTorch tensors
    X_train_t = torch.FloatTensor(X_train_combined)
    error_train_t = torch.FloatTensor(error_train)
    X_val_t = torch.FloatTensor(X_val_combined)
    error_val_t = torch.FloatTensor(error_val)

    # Create data loaders
    train_dataset = TensorDataset(X_train_t, error_train_t)
    val_dataset = TensorDataset(X_val_t, error_val_t)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize Multi-Head error predictor
    model = MultiHeadErrorPredictorToMunsell().to(device)
    LOGGER.info("")
    LOGGER.info("Multi-Head error predictor architecture:")
    LOGGER.info("%s", model)

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    LOGGER.info("Total parameters: %s", f"{total_params:,}")

    # Count parameters per branch
    hue_params = sum(p.numel() for p in model.hue_branch.parameters())
    value_params = sum(p.numel() for p in model.value_branch.parameters())
    chroma_params = sum(p.numel() for p in model.chroma_branch.parameters())
    code_params = sum(p.numel() for p in model.code_branch.parameters())

    LOGGER.info(" - Hue branch: %s", f"{hue_params:,}")
    LOGGER.info(" - Value branch: %s", f"{value_params:,}")
    LOGGER.info(" - Chroma branch: %s (WIDER 1.5x)", f"{chroma_params:,}")
    LOGGER.info(" - Code branch: %s", f"{code_params:,}")

    # Training setup with precision-focused loss
    LOGGER.info("")
    LOGGER.info("Using precision-focused loss function:")
    LOGGER.info(" - MSE (weight: 1.0)")
    LOGGER.info(" - MAE (weight: 0.5)")
    LOGGER.info(" - Log penalty for small errors (weight: 0.3)")
    LOGGER.info(" - Huber loss with delta=0.01 (weight: 0.5)")

    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.5, patience=5
    )
    criterion = precision_focused_loss

    # MLflow setup
    run_name = setup_mlflow_experiment("from_xyY", "multi_head_multi_error_predictor")

    LOGGER.info("")
    LOGGER.info("MLflow run: %s", run_name)

    # Training loop state
    best_val_loss = float("inf")
    patience_counter = 0
    final_epoch = 0  # tracked explicitly so logging is safe even when epochs == 0

    # Checkpoint location is fixed up-front so the export step below never
    # depends on names first bound inside the training loop.
    # parents=True also creates the intermediate "models" directory.
    model_directory.mkdir(parents=True, exist_ok=True)
    checkpoint_file = model_directory / "multi_head_multi_error_predictor_best.pth"

    LOGGER.info("")
    LOGGER.info("Starting training...")

    with mlflow.start_run(run_name=run_name):
        # Log hyperparameters
        mlflow.log_params(
            {
                "num_epochs": epochs,
                "batch_size": batch_size,
                "learning_rate": lr,
                "weight_decay": 1e-5,
                "optimizer": "AdamW",
                "scheduler": "ReduceLROnPlateau",
                "criterion": "precision_focused_loss",
                "patience": patience,
                "total_params": total_params,
            }
        )

        for epoch in range(epochs):
            train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
            val_loss = validate(model, val_loader, criterion, device)
            final_epoch = epoch + 1

            # Update learning rate
            scheduler.step(val_loss)

            # Log to MLflow
            log_training_epoch(
                epoch, train_loss, val_loss, optimizer.param_groups[0]["lr"]
            )

            LOGGER.info(
                "Epoch %03d/%d - Train Loss: %.6f, Val Loss: %.6f, LR: %.6f",
                epoch + 1,
                epochs,
                train_loss,
                val_loss,
                optimizer.param_groups[0]["lr"],
            )

            # Early stopping bookkeeping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0

                # Save best model
                torch.save(
                    {
                        "model_state_dict": model.state_dict(),
                        "epoch": epoch,
                        "val_loss": val_loss,
                    },
                    checkpoint_file,
                )

                LOGGER.info(" → Saved best model (val_loss: %.6f)", val_loss)
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    LOGGER.info("")
                    LOGGER.info("Early stopping after %d epochs", epoch + 1)
                    break

        # Log final metrics
        mlflow.log_metrics(
            {
                "best_val_loss": best_val_loss,
                "final_epoch": final_epoch,
            }
        )

        # Export to ONNX
        LOGGER.info("")
        LOGGER.info("Exporting Multi-Head error predictor to ONNX...")
        model.eval()

        # Reload best weights. weights_only=False keeps loading consistent with
        # the sibling training scripts under PyTorch >= 2.6, where the default
        # flipped to weights_only=True.
        checkpoint = torch.load(checkpoint_file, weights_only=False)
        model.load_state_dict(checkpoint["model_state_dict"])

        # Create dummy input (xyY_norm + base_pred_norm = 7 inputs)
        dummy_input = torch.randn(1, 7).to(device)

        # Export
        onnx_file = model_directory / "multi_head_multi_error_predictor.onnx"
        torch.onnx.export(
            model,
            dummy_input,
            onnx_file,
            export_params=True,
            opset_version=15,
            input_names=["combined_input"],
            output_names=["error_correction"],
            dynamic_axes={
                "combined_input": {0: "batch_size"},
                "error_correction": {0: "batch_size"},
            },
        )

        LOGGER.info("Multi-Head error predictor ONNX model saved to: %s", onnx_file)

        # Log artifacts
        mlflow.log_artifact(str(checkpoint_file))
        mlflow.log_artifact(str(onnx_file))

        # Log model
        mlflow.pytorch.log_model(model, "model")

    LOGGER.info("=" * 80)
373
+
374
+
375
if __name__ == "__main__":
    # Message-only console output; force=True replaces any handlers that
    # imported libraries may already have installed on the root logger.
    logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)

    main()
learning_munsell/training/from_xyY/train_multi_head_multi_error_predictor_large.py ADDED
@@ -0,0 +1,409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train Multi-Head error predictor on large dataset (2M samples).
3
+
4
+ Architecture:
5
+ - 4 independent error correction branches (one per component)
6
+ - Each branch: 7 inputs (xyY + base_pred) → encoder → decoder → 1 error output
7
+ - Chroma branch: WIDER (1.5x capacity for hardest component)
8
+
9
+ Uses the large dataset for improved model training.
10
+ """
11
+
12
+ import logging
13
+ from pathlib import Path
14
+ import click
15
+ import mlflow
16
+ import mlflow.pytorch
17
+ import numpy as np
18
+ import onnxruntime as ort
19
+ import torch
20
+ from numpy.typing import NDArray
21
+ from torch import nn, optim
22
+ from torch.utils.data import DataLoader, TensorDataset
23
+
24
+ from learning_munsell import PROJECT_ROOT
25
+ from learning_munsell.models.networks import (
26
+ ComponentErrorPredictor,
27
+ MultiHeadErrorPredictorToMunsell,
28
+ )
29
+ from learning_munsell.utilities.common import log_training_epoch, setup_mlflow_experiment
30
+ from learning_munsell.utilities.data import normalize_xyY, normalize_munsell
31
+ from learning_munsell.utilities.losses import precision_focused_loss
32
+ from learning_munsell.utilities.training import train_epoch, validate
33
+
34
+ LOGGER = logging.getLogger(__name__)
35
+
36
+
37
def load_base_model(
    model_path: Path, params_path: Path
) -> tuple[ort.InferenceSession, dict, dict]:
    """
    Load the base ONNX model and normalization parameters.

    Parameters
    ----------
    model_path : Path
        Path to the ONNX model file.
    params_path : Path
        Path to the normalization parameters file (.npz).

    Returns
    -------
    session : ort.InferenceSession
        ONNX Runtime inference session.
    input_params : dict
        Input normalization parameters.
    output_params : dict
        Output normalization parameters.
    """
    session = ort.InferenceSession(str(model_path))
    # Each parameter dict is stored in the .npz as a 0-d object array;
    # .item() recovers the plain dict (hence allow_pickle=True).
    stored = np.load(params_path, allow_pickle=True)
    normalization = tuple(
        stored[key].item() for key in ("input_params", "output_params")
    )
    return (session, *normalization)
62
+
63
+
64
+ @click.command()
65
+ @click.option(
66
+ "--base-model",
67
+ type=click.Path(exists=True, path_type=Path),
68
+ default=None,
69
+ help="Path to Multi-Head large base model ONNX file",
70
+ )
71
+ @click.option(
72
+ "--params",
73
+ type=click.Path(exists=True, path_type=Path),
74
+ default=None,
75
+ help="Path to normalization params file",
76
+ )
77
+ @click.option(
78
+ "--output-suffix",
79
+ type=str,
80
+ default="large",
81
+ help="Suffix for output filenames (default: 'large')",
82
+ )
83
+ @click.option(
84
+ "--epochs",
85
+ type=int,
86
+ default=300,
87
+ help="Number of training epochs (default: 300)",
88
+ )
89
+ @click.option(
90
+ "--batch-size",
91
+ type=int,
92
+ default=2048,
93
+ help="Batch size for training (default: 2048)",
94
+ )
95
+ @click.option(
96
+ "--lr",
97
+ type=float,
98
+ default=3e-4,
99
+ help="Learning rate (default: 3e-4)",
100
+ )
101
+ @click.option(
102
+ "--patience",
103
+ type=int,
104
+ default=30,
105
+ help="Early stopping patience (default: 30)",
106
+ )
107
+ def main(
108
+ base_model: Path | None,
109
+ params: Path | None,
110
+ output_suffix: str,
111
+ epochs: int,
112
+ batch_size: int,
113
+ lr: float,
114
+ patience: int,
115
+ ) -> None:
116
+ """
117
+ Train Multi-Head error predictor on large dataset.
118
+
119
+ This script trains an error predictor on top of the Multi-Head large
120
+ base model, using the 2M sample dataset for improved accuracy.
121
+
122
+ Parameters
123
+ ----------
124
+ base_model : Path, optional
125
+ Path to the Multi-Head large base model ONNX file.
126
+ Default: models/from_xyY/multi_head_large.onnx
127
+ params : Path, optional
128
+ Path to the normalization parameters file.
129
+ Default: models/from_xyY/multi_head_large_normalization_params.npz
130
+ output_suffix : str
131
+ Suffix for output filenames (default: 'large').
132
+
133
+ Notes
134
+ -----
135
+ The training pipeline:
136
+ 1. Loads pre-trained Multi-Head large base model
137
+ 2. Generates base model predictions for training data (in batches)
138
+ 3. Computes residual errors between predictions and targets
139
+ 4. Trains multi-head error predictor on these residuals
140
+ 5. Uses precision-focused loss function
141
+ 6. Learning rate scheduling with ReduceLROnPlateau
142
+ 7. Early stopping based on validation loss
143
+ 8. Exports model to ONNX format
144
+ 9. Logs metrics and artifacts to MLflow
145
+ """
146
+
147
+ LOGGER.info("=" * 80)
148
+ LOGGER.info("Multi-Head Error Predictor: Large Dataset (2M samples)")
149
+ LOGGER.info("=" * 80)
150
+
151
+ # Set device
152
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
153
+ if torch.backends.mps.is_available():
154
+ device = torch.device("mps")
155
+ LOGGER.info("Using device: %s", device)
156
+
157
+ # Paths
158
+ model_directory = PROJECT_ROOT / "models" / "from_xyY"
159
+ data_dir = PROJECT_ROOT / "data"
160
+
161
+ # Use provided paths or defaults for large model
162
+ if base_model is None:
163
+ base_model = model_directory / "multi_head_large.onnx"
164
+ if params is None:
165
+ params = model_directory / "multi_head_large_normalization_params.npz"
166
+
167
+ cache_file = data_dir / "training_data_large.npz"
168
+
169
+ if not cache_file.exists():
170
+ LOGGER.error("Error: Large training data not found at %s", cache_file)
171
+ LOGGER.error("Please run generate_large_training_data.py first")
172
+ return
173
+
174
+ if not base_model.exists():
175
+ LOGGER.error("Error: Multi-Head large base model not found at %s", base_model)
176
+ LOGGER.error("Please run train_multi_head_large.py first")
177
+ return
178
+
179
+ # Load base model
180
+ LOGGER.info("")
181
+ LOGGER.info("Loading Multi-Head large base model from %s...", base_model)
182
+ base_session, input_params, output_params = load_base_model(base_model, params)
183
+
184
+ # Load training data
185
+ LOGGER.info("Loading large training data from %s...", cache_file)
186
+ data = np.load(cache_file)
187
+ X_train = data["X_train"]
188
+ y_train = data["y_train"]
189
+ X_val = data["X_val"]
190
+ y_val = data["y_val"]
191
+
192
+ LOGGER.info("Train samples: %d", len(X_train))
193
+ LOGGER.info("Validation samples: %d", len(X_val))
194
+
195
+ # Generate base model predictions
196
+ LOGGER.info("")
197
+ LOGGER.info("Generating Multi-Head large base model predictions...")
198
+ X_train_norm = normalize_xyY(X_train, input_params)
199
+ y_train_norm = normalize_munsell(y_train, output_params)
200
+
201
+ # Base predictions (normalized) - process in batches for memory efficiency
202
+ LOGGER.info(" Processing training set predictions...")
203
+ inference_batch_size = 50000
204
+ base_pred_train_norm = []
205
+ for i in range(0, len(X_train_norm), inference_batch_size):
206
+ batch = X_train_norm[i : i + inference_batch_size]
207
+ pred = base_session.run(None, {"xyY": batch})[0]
208
+ base_pred_train_norm.append(pred)
209
+ base_pred_train_norm = np.concatenate(base_pred_train_norm, axis=0)
210
+
211
+ X_val_norm = normalize_xyY(X_val, input_params)
212
+ y_val_norm = normalize_munsell(y_val, output_params)
213
+
214
+ LOGGER.info(" Processing validation set predictions...")
215
+ base_pred_val_norm = []
216
+ for i in range(0, len(X_val_norm), inference_batch_size):
217
+ batch = X_val_norm[i : i + inference_batch_size]
218
+ pred = base_session.run(None, {"xyY": batch})[0]
219
+ base_pred_val_norm.append(pred)
220
+ base_pred_val_norm = np.concatenate(base_pred_val_norm, axis=0)
221
+
222
+ # Compute errors (in normalized space)
223
+ error_train = y_train_norm - base_pred_train_norm
224
+ error_val = y_val_norm - base_pred_val_norm
225
+
226
+ # Statistics
227
+ LOGGER.info("")
228
+ LOGGER.info("Multi-Head large base model error statistics (normalized space):")
229
+ LOGGER.info(" Mean absolute error: %.6f", np.mean(np.abs(error_train)))
230
+ LOGGER.info(" Std of error: %.6f", np.std(error_train))
231
+ LOGGER.info(" Max absolute error: %.6f", np.max(np.abs(error_train)))
232
+
233
+ # Create combined input: [xyY_norm, base_prediction_norm]
234
+ X_train_combined = np.concatenate([X_train_norm, base_pred_train_norm], axis=1)
235
+ X_val_combined = np.concatenate([X_val_norm, base_pred_val_norm], axis=1)
236
+
237
+ # Convert to PyTorch tensors
238
+ X_train_t = torch.FloatTensor(X_train_combined)
239
+ error_train_t = torch.FloatTensor(error_train)
240
+ X_val_t = torch.FloatTensor(X_val_combined)
241
+ error_val_t = torch.FloatTensor(error_val)
242
+
243
+ # Create data loaders (larger batch size for large dataset)
244
+ train_dataset = TensorDataset(X_train_t, error_train_t)
245
+ val_dataset = TensorDataset(X_val_t, error_val_t)
246
+
247
+ train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
248
+ val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
249
+
250
+ # Initialize Multi-Head error predictor
251
+ model = MultiHeadErrorPredictorToMunsell().to(device)
252
+ LOGGER.info("")
253
+ LOGGER.info("Multi-Head error predictor architecture:")
254
+ LOGGER.info("%s", model)
255
+
256
+ # Count parameters
257
+ total_params = sum(p.numel() for p in model.parameters())
258
+ LOGGER.info("Total parameters: %s", f"{total_params:,}")
259
+
260
+ # Count parameters per branch
261
+ hue_params = sum(p.numel() for p in model.hue_branch.parameters())
262
+ value_params = sum(p.numel() for p in model.value_branch.parameters())
263
+ chroma_params = sum(p.numel() for p in model.chroma_branch.parameters())
264
+ code_params = sum(p.numel() for p in model.code_branch.parameters())
265
+
266
+ LOGGER.info(" - Hue branch: %s", f"{hue_params:,}")
267
+ LOGGER.info(" - Value branch: %s", f"{value_params:,}")
268
+ LOGGER.info(" - Chroma branch: %s (WIDER 1.5x)", f"{chroma_params:,}")
269
+ LOGGER.info(" - Code branch: %s", f"{code_params:,}")
270
+
271
+ # Training setup
272
+ LOGGER.info("")
273
+ LOGGER.info("Using precision-focused loss function:")
274
+ LOGGER.info(" - MSE (weight: 1.0)")
275
+ LOGGER.info(" - MAE (weight: 0.5)")
276
+ LOGGER.info(" - Log penalty for small errors (weight: 0.3)")
277
+ LOGGER.info(" - Huber loss with delta=0.01 (weight: 0.5)")
278
+
279
+ optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
280
+ scheduler = optim.lr_scheduler.ReduceLROnPlateau(
281
+ optimizer, mode="min", factor=0.5, patience=10
282
+ )
283
+ criterion = precision_focused_loss
284
+
285
+ # MLflow setup
286
+ run_name = setup_mlflow_experiment(
287
+ "from_xyY", f"multi_head_multi_error_predictor_{output_suffix}"
288
+ )
289
+
290
+ LOGGER.info("")
291
+ LOGGER.info("MLflow run: %s", run_name)
292
+
293
+ # Training loop
294
+ best_val_loss = float("inf")
295
+ patience_counter = 0
296
+
297
+ LOGGER.info("")
298
+ LOGGER.info("Starting training...")
299
+
300
+ with mlflow.start_run(run_name=run_name):
301
+ mlflow.log_params(
302
+ {
303
+ "model": f"multi_head_multi_error_predictor_{output_suffix}",
304
+ "num_epochs": epochs,
305
+ "batch_size": batch_size,
306
+ "learning_rate": lr,
307
+ "weight_decay": 1e-5,
308
+ "optimizer": "AdamW",
309
+ "scheduler": "ReduceLROnPlateau",
310
+ "criterion": "precision_focused_loss",
311
+ "patience": patience,
312
+ "total_params": total_params,
313
+ "train_samples": len(X_train),
314
+ "val_samples": len(X_val),
315
+ "dataset": "large_2M",
316
+ }
317
+ )
318
+
319
+ for epoch in range(epochs):
320
+ train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
321
+ val_loss = validate(model, val_loader, criterion, device)
322
+
323
+ scheduler.step(val_loss)
324
+
325
+ log_training_epoch(
326
+ epoch, train_loss, val_loss, optimizer.param_groups[0]["lr"]
327
+ )
328
+
329
+ LOGGER.info(
330
+ "Epoch %03d/%d - Train Loss: %.6f, Val Loss: %.6f, LR: %.6f",
331
+ epoch + 1,
332
+ epochs,
333
+ train_loss,
334
+ val_loss,
335
+ optimizer.param_groups[0]["lr"],
336
+ )
337
+
338
+ if val_loss < best_val_loss:
339
+ best_val_loss = val_loss
340
+ patience_counter = 0
341
+
342
+ model_directory.mkdir(exist_ok=True)
343
+ checkpoint_file = (
344
+ model_directory / f"multi_head_multi_error_predictor_{output_suffix}_best.pth"
345
+ )
346
+
347
+ torch.save(
348
+ {
349
+ "model_state_dict": model.state_dict(),
350
+ "epoch": epoch,
351
+ "val_loss": val_loss,
352
+ "output_params": output_params,
353
+ },
354
+ checkpoint_file,
355
+ )
356
+
357
+ LOGGER.info(" → Saved best model (val_loss: %.6f)", val_loss)
358
+ else:
359
+ patience_counter += 1
360
+ if patience_counter >= patience:
361
+ LOGGER.info("")
362
+ LOGGER.info("Early stopping after %d epochs", epoch + 1)
363
+ break
364
+
365
+ mlflow.log_metrics(
366
+ {
367
+ "best_val_loss": best_val_loss,
368
+ "final_epoch": epoch + 1,
369
+ }
370
+ )
371
+
372
+ # Export to ONNX
373
+ LOGGER.info("")
374
+ LOGGER.info("Exporting Multi-Head error predictor to ONNX...")
375
+ model.eval()
376
+
377
+ checkpoint = torch.load(checkpoint_file, weights_only=False)
378
+ model.load_state_dict(checkpoint["model_state_dict"])
379
+
380
+ dummy_input = torch.randn(1, 7).to(device)
381
+
382
+ onnx_file = model_directory / f"multi_head_multi_error_predictor_{output_suffix}.onnx"
383
+ torch.onnx.export(
384
+ model,
385
+ dummy_input,
386
+ onnx_file,
387
+ export_params=True,
388
+ opset_version=15,
389
+ input_names=["combined_input"],
390
+ output_names=["error_correction"],
391
+ dynamic_axes={
392
+ "combined_input": {0: "batch_size"},
393
+ "error_correction": {0: "batch_size"},
394
+ },
395
+ )
396
+
397
+ LOGGER.info("Multi-Head error predictor ONNX model saved to: %s", onnx_file)
398
+
399
+ mlflow.log_artifact(str(checkpoint_file))
400
+ mlflow.log_artifact(str(onnx_file))
401
+ mlflow.pytorch.log_model(model, "model")
402
+
403
+ LOGGER.info("=" * 80)
404
+
405
+
406
+ if __name__ == "__main__":
407
+ logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)
408
+
409
+ main()
learning_munsell/training/from_xyY/train_multi_head_st2084.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train multi-head ML model for xyY to Munsell conversion with ST.2084 (PQ) encoded Y.
3
+
4
+ Experiment: Apply SMPTE ST.2084 (Perceptual Quantizer) encoding to Y before
5
+ normalization. ST.2084 is designed for perceptual uniformity across a wide
6
+ luminance range, potentially providing better alignment with Munsell Value
7
+ than simple gamma correction.
8
+
9
+ The multi-head architecture has separate heads for each Munsell component,
10
+ so PQ encoding on Y should primarily benefit Value prediction without
11
+ negatively impacting Chroma prediction.
12
+ """
13
+
14
+ import logging
15
+ from typing import Any
16
+
17
+ import click
18
+ import mlflow
19
+ import mlflow.pytorch
20
+ import numpy as np
21
+ import torch
22
+ from colour.models import eotf_inverse_ST2084
23
+ from numpy.typing import NDArray
24
+ from torch import nn, optim
25
+ from torch.utils.data import DataLoader, TensorDataset
26
+
27
+ from learning_munsell import PROJECT_ROOT
28
+ from learning_munsell.models.networks import MultiHeadMLPToMunsell
29
+ from learning_munsell.utilities.common import log_training_epoch, setup_mlflow_experiment
30
+ from learning_munsell.utilities.data import (
31
+ MUNSELL_NORMALIZATION_PARAMS,
32
+ normalize_munsell,
33
+ )
34
+ from learning_munsell.utilities.losses import weighted_mse_loss
35
+ from learning_munsell.utilities.training import train_epoch, validate
36
+
37
# Module-level logger named after the module, per stdlib logging convention.
LOGGER = logging.getLogger(__name__)

# Peak luminance for ST.2084 scaling
# Munsell Y is relative luminance [0, 1], we scale to cd/m² for ST.2084
# Using 100 cd/m² as reference white (typical SDR display)
L_P_REFERENCE = 100.0
43
+
44
+
45
def normalize_inputs(
    X: NDArray, L_p: float = L_P_REFERENCE
) -> tuple[NDArray, dict[str, Any]]:
    """
    Normalize xyY inputs to [0, 1] range with ST.2084 (PQ) encoding on Y.

    Parameters
    ----------
    X : ndarray
        xyY values of shape (n, 3) where columns are [x, y, Y].
    L_p : float
        Peak luminance in cd/m² for ST.2084 scaling.

    Returns
    -------
    ndarray
        Normalized values with ST.2084-encoded Y, dtype float32.
    dict
        Normalization parameters including L_p and encoding type.
    """

    # xyY chromaticity and luminance ranges (all [0, 1]); kept explicit so the
    # returned params dict fully describes the transform for inference time.
    x_range = (0.0, 1.0)
    y_range = (0.0, 1.0)
    Y_range = (0.0, 1.0)

    # Cast up-front so the returned array actually matches the float32 dtype
    # promised in the docstring (np.load typically yields float64); downstream
    # torch.FloatTensor conversion is unaffected.
    X_norm = X.astype(np.float32)
    X_norm[:, 0] = (X[:, 0] - x_range[0]) / (x_range[1] - x_range[0])
    X_norm[:, 1] = (X[:, 1] - y_range[0]) / (y_range[1] - y_range[0])

    # Normalize Y first, then apply ST.2084
    Y_normalized = (X[:, 2] - Y_range[0]) / (Y_range[1] - Y_range[0])
    # Clip to avoid numerical issues at the domain boundaries
    Y_normalized = np.clip(Y_normalized, 0, 1)
    # Scale to cd/m² and apply ST.2084 inverse EOTF (PQ encoding)
    # ST.2084 expects absolute luminance in cd/m²
    Y_cdm2 = Y_normalized * L_p
    # Passing L_p makes eotf_inverse_ST2084 treat L_p cd/m² as the signal
    # peak, so the encoded Y spans approximately [0, 1] for this dataset.
    X_norm[:, 2] = eotf_inverse_ST2084(Y_cdm2, L_p=L_p)

    params = {
        "x_range": x_range,
        "y_range": y_range,
        "Y_range": Y_range,
        "encoding": "ST2084",
        "L_p": L_p,
    }

    return X_norm, params
94
+
95
+
96
@click.command()
@click.option("--epochs", default=200, help="Number of training epochs")
@click.option("--batch-size", default=1024, help="Batch size for training")
@click.option("--lr", default=5e-4, help="Learning rate")
@click.option("--patience", default=20, help="Early stopping patience")
def main(epochs: int, batch_size: int, lr: float, patience: int) -> None:
    """
    Train the multi-head model with ST.2084 (PQ) encoded Y input.

    Notes
    -----
    The training pipeline:
    1. Loads training and validation data from cache
    2. Normalizes inputs with ST.2084 (PQ) encoding on Y
    3. Normalizes Munsell outputs to [0, 1] range
    4. Trains multi-head MLP with weighted MSE loss
    5. Uses early stopping based on validation loss
    6. Exports best model to ONNX format
    7. Logs metrics and artifacts to MLflow

    ST.2084 (Perceptual Quantizer) encoding is designed for perceptual
    uniformity across a wide luminance range, potentially providing better
    alignment with Munsell Value than simple gamma correction. The multi-head
    architecture isolates this effect to the Value head without negatively
    impacting Chroma prediction.
    """

    LOGGER.info("=" * 80)
    LOGGER.info("ML-Based xyY to Munsell Conversion: Multi-Head ST.2084 Experiment")
    LOGGER.info("ST.2084 (PQ) encoding applied to Y component (L_p=%.0f cd/m²)", L_P_REFERENCE)
    LOGGER.info("=" * 80)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    LOGGER.info("Using device: %s", device)

    # Load training data previously produced by the data-generation script.
    data_dir = PROJECT_ROOT / "data"
    cache_file = data_dir / "training_data.npz"

    if not cache_file.exists():
        LOGGER.error("Error: Training data not found at %s", cache_file)
        LOGGER.error("Please run 01_generate_training_data.py first")
        return

    LOGGER.info("Loading training data from %s...", cache_file)
    data = np.load(cache_file)

    X_train = data["X_train"]
    y_train = data["y_train"]
    X_val = data["X_val"]
    y_val = data["y_val"]

    LOGGER.info("Train samples: %d", len(X_train))
    LOGGER.info("Validation samples: %d", len(X_val))

    # Normalize data with ST.2084 encoding.  The returned params depend only
    # on constants and L_p, so the validation call's params are identical to
    # the training ones and discarded.
    X_train_norm, input_params = normalize_inputs(X_train, L_p=L_P_REFERENCE)
    X_val_norm, _ = normalize_inputs(X_val, L_p=L_P_REFERENCE)

    output_params = MUNSELL_NORMALIZATION_PARAMS
    y_train_norm = normalize_munsell(y_train, output_params)
    y_val_norm = normalize_munsell(y_val, output_params)

    LOGGER.info("")
    LOGGER.info("Input normalization with ST.2084 (L_p=%.0f):", L_P_REFERENCE)
    LOGGER.info(" Y range after ST.2084: [%.4f, %.4f]", X_train_norm[:, 2].min(), X_train_norm[:, 2].max())

    # Convert to PyTorch tensors (FloatTensor => float32 on CPU; batches are
    # moved to `device` inside train_epoch/validate).
    X_train_t = torch.FloatTensor(X_train_norm)
    y_train_t = torch.FloatTensor(y_train_norm)
    X_val_t = torch.FloatTensor(X_val_norm)
    y_val_t = torch.FloatTensor(y_val_norm)

    # Create data loaders
    train_dataset = TensorDataset(X_train_t, y_train_t)
    val_dataset = TensorDataset(X_val_t, y_val_t)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model
    model = MultiHeadMLPToMunsell().to(device)
    LOGGER.info("")
    LOGGER.info("Model architecture:")
    LOGGER.info("%s", model)

    total_params = sum(p.numel() for p in model.parameters())
    LOGGER.info("Total parameters: %s", f"{total_params:,}")

    # Training setup — plain Adam with no scheduler, matching the baseline
    # multi-head experiment so only the Y encoding differs.
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = weighted_mse_loss

    # MLflow setup
    run_name = setup_mlflow_experiment("from_xyY", "multi_head_st2084")

    LOGGER.info("")
    LOGGER.info("MLflow run: %s", run_name)

    # Training loop state for early stopping.
    best_val_loss = float("inf")
    patience_counter = 0

    LOGGER.info("")
    LOGGER.info("Starting training...")

    # NOTE(review): if --epochs=0 the loop below never runs and `epoch` /
    # `checkpoint_file` / `model_directory` are undefined further down.
    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(
            {
                "model": "multi_head_st2084",
                "num_epochs": epochs,
                "batch_size": batch_size,
                "learning_rate": lr,
                "optimizer": "Adam",
                "criterion": "weighted_mse_loss",
                "patience": patience,
                "total_params": total_params,
                "encoding": "ST2084",
                "L_p": L_P_REFERENCE,
            }
        )

        for epoch in range(epochs):
            train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
            val_loss = validate(model, val_loader, criterion, device)

            log_training_epoch(
                epoch, train_loss, val_loss, optimizer.param_groups[0]["lr"]
            )

            LOGGER.info(
                "Epoch %03d/%d - Train Loss: %.6f, Val Loss: %.6f",
                epoch + 1,
                epochs,
                train_loss,
                val_loss,
            )

            # Checkpoint only on improvement; reset the patience counter.
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0

                # NOTE(review): mkdir without parents=True assumes
                # PROJECT_ROOT / "models" already exists — confirm.
                model_directory = PROJECT_ROOT / "models" / "from_xyY"
                model_directory.mkdir(exist_ok=True)
                checkpoint_file = model_directory / "multi_head_st2084_best.pth"

                torch.save(
                    {
                        "model_state_dict": model.state_dict(),
                        "input_params": input_params,
                        "output_params": output_params,
                        "epoch": epoch,
                        "val_loss": val_loss,
                    },
                    checkpoint_file,
                )

                LOGGER.info(" → Saved best model (val_loss: %.6f)", val_loss)
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    LOGGER.info("")
                    LOGGER.info("Early stopping after %d epochs", epoch + 1)
                    break

        mlflow.log_metrics(
            {
                "best_val_loss": best_val_loss,
                "final_epoch": epoch + 1,
            }
        )

        # Export to ONNX
        LOGGER.info("")
        LOGGER.info("Exporting model to ONNX...")
        model.eval()

        # Reload the best checkpoint so the export reflects the best epoch,
        # not whatever state the loop ended in.
        checkpoint = torch.load(checkpoint_file)
        model.load_state_dict(checkpoint["model_state_dict"])

        dummy_input = torch.randn(1, 3).to(device)

        onnx_file = model_directory / "multi_head_st2084.onnx"
        torch.onnx.export(
            model,
            dummy_input,
            onnx_file,
            export_params=True,
            opset_version=17,
            input_names=["xyY_st2084"],
            output_names=["munsell_spec"],
            dynamic_axes={"xyY_st2084": {0: "batch_size"}, "munsell_spec": {0: "batch_size"}},
        )

        # Save normalization parameters (including ST.2084 info).
        # NOTE(review): dict values are stored by np.savez as object arrays;
        # loading them back requires allow_pickle=True — confirm readers do.
        params_file = model_directory / "multi_head_st2084_normalization_params.npz"
        np.savez(
            params_file,
            input_params=input_params,
            output_params=output_params,
        )

        LOGGER.info("ONNX model saved to: %s", onnx_file)
        LOGGER.info("Normalization parameters saved to: %s", params_file)
        LOGGER.info("IMPORTANT: Input Y must be ST.2084-encoded with L_p=%.0f", L_P_REFERENCE)

        mlflow.log_artifact(str(checkpoint_file))
        mlflow.log_artifact(str(onnx_file))
        mlflow.log_artifact(str(params_file))
        mlflow.pytorch.log_model(model, "model")

    LOGGER.info("=" * 80)
309
+
310
if __name__ == "__main__":
    # force=True removes any pre-existing root handlers (e.g. ones installed
    # by imported libraries) so the bare "%(message)s" format takes effect.
    logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)

    main()