Upload edit\Qwen3-TTS-test\.venv\Lib\site-packages\sklearn\ensemble\_hist_gradient_boosting\splitting.pyx with huggingface_hub
Browse files
edit//Qwen3-TTS-test//.venv//Lib//site-packages//sklearn//ensemble//_hist_gradient_boosting//splitting.pyx
ADDED
|
@@ -0,0 +1,1201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""This module contains routines and data structures to:
|
| 2 |
+
|
| 3 |
+
- Find the best possible split of a node. For a given node, a split is
|
| 4 |
+
characterized by a feature and a bin.
|
| 5 |
+
- Apply a split to a node, i.e. split the indices of the samples at the node
|
| 6 |
+
into the newly created left and right children.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
# Authors: The scikit-learn developers
|
| 10 |
+
# SPDX-License-Identifier: BSD-3-Clause
|
| 11 |
+
|
| 12 |
+
cimport cython
|
| 13 |
+
from cython.parallel import prange
|
| 14 |
+
import numpy as np
|
| 15 |
+
from libc.math cimport INFINITY, ceil
|
| 16 |
+
from libc.stdlib cimport malloc, free, qsort
|
| 17 |
+
from libc.string cimport memcpy
|
| 18 |
+
|
| 19 |
+
from sklearn.utils._typedefs cimport uint8_t
|
| 20 |
+
from sklearn.ensemble._hist_gradient_boosting.common cimport X_BINNED_DTYPE_C
|
| 21 |
+
from sklearn.ensemble._hist_gradient_boosting.common cimport Y_DTYPE_C
|
| 22 |
+
from sklearn.ensemble._hist_gradient_boosting.common cimport hist_struct
|
| 23 |
+
from sklearn.ensemble._hist_gradient_boosting.common cimport BITSET_INNER_DTYPE_C
|
| 24 |
+
from sklearn.ensemble._hist_gradient_boosting.common cimport BITSET_DTYPE_C
|
| 25 |
+
from sklearn.ensemble._hist_gradient_boosting.common cimport MonotonicConstraint
|
| 26 |
+
from sklearn.ensemble._hist_gradient_boosting._bitset cimport init_bitset
|
| 27 |
+
from sklearn.ensemble._hist_gradient_boosting._bitset cimport set_bitset
|
| 28 |
+
from sklearn.ensemble._hist_gradient_boosting._bitset cimport in_bitset
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
cdef struct split_info_struct:
    # Same as the SplitInfo class, but we need a C struct to use it in the
    # nogil sections and to use in arrays.
    # The per-field notes below mirror the SplitInfo docstring.
    Y_DTYPE_C gain                  # gain of the split
    int feature_idx                 # index of the feature to be split
    unsigned int bin_idx            # bin on which the split is made
    uint8_t missing_go_to_left      # whether missing values go to the left child
    Y_DTYPE_C sum_gradient_left     # sum of gradients of the left child samples
    Y_DTYPE_C sum_gradient_right    # sum of gradients of the right child samples
    Y_DTYPE_C sum_hessian_left      # sum of hessians of the left child samples
    Y_DTYPE_C sum_hessian_right     # sum of hessians of the right child samples
    unsigned int n_samples_left     # number of samples in the left child
    unsigned int n_samples_right    # number of samples in the right child
    Y_DTYPE_C value_left            # value of the left child (presumably the bounded node value; confirm)
    Y_DTYPE_C value_right           # value of the right child (presumably the bounded node value; confirm)
    uint8_t is_categorical          # whether the split is on a categorical feature
    BITSET_DTYPE_C left_cat_bitset  # categories going left; only meaningful when is_categorical
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
# Used in categorical splits for sorting categories by increasing values of
# sum_gradients / sum_hessians.
cdef struct categorical_info:
    # Bin index identifying the category.
    X_BINNED_DTYPE_C bin_idx
    # Sort key; presumably the sum_gradients / sum_hessians ratio mentioned
    # above -- TODO confirm against the code that fills this struct.
    Y_DTYPE_C value
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class SplitInfo:
    """Plain container describing one candidate split of a node.

    Every constructor argument is stored unchanged as an attribute of the
    same name; no validation or conversion is performed.

    Parameters
    ----------
    gain : float
        The gain of the split.
    feature_idx : int
        The index of the feature to be split.
    bin_idx : int
        The index of the bin on which the split is made. Should be ignored if
        `is_categorical` is True: `left_cat_bitset` will be used to determine
        the split.
    missing_go_to_left : bool
        Whether missing values should go to the left child. This is used
        whether the split is categorical or not.
    sum_gradient_left : float
        The sum of the gradients of all the samples in the left child.
    sum_hessian_left : float
        The sum of the hessians of all the samples in the left child.
    sum_gradient_right : float
        The sum of the gradients of all the samples in the right child.
    sum_hessian_right : float
        The sum of the hessians of all the samples in the right child.
    n_samples_left : int
        The number of samples in the left child.
    n_samples_right : int
        The number of samples in the right child.
    value_left : float
        The value associated with the left child.
    value_right : float
        The value associated with the right child.
    is_categorical : bool
        Whether the split is done on a categorical feature.
    left_cat_bitset : ndarray of shape=(8,), dtype=uint32 or None
        Bitset representing the categories that go to the left. This is used
        only when `is_categorical` is True.
        Note that missing values are part of that bitset if there are missing
        values in the training data. For missing values, we rely on that
        bitset for splitting, but at prediction time, we rely on
        missing_go_to_left.
    """
    def __init__(self, gain, feature_idx, bin_idx,
                 missing_go_to_left, sum_gradient_left, sum_hessian_left,
                 sum_gradient_right, sum_hessian_right, n_samples_left,
                 n_samples_right, value_left, value_right,
                 is_categorical, left_cat_bitset):
        # Tuple targets are assigned left to right, so attributes are created
        # in the same order as the constructor arguments.

        # Where the split happens and how good it is.
        self.gain, self.feature_idx, self.bin_idx = gain, feature_idx, bin_idx
        self.missing_go_to_left = missing_go_to_left

        # Gradient / hessian statistics of each child.
        (self.sum_gradient_left, self.sum_hessian_left,
         self.sum_gradient_right, self.sum_hessian_right) = (
            sum_gradient_left, sum_hessian_left,
            sum_gradient_right, sum_hessian_right)

        # Sample counts and values of each child.
        self.n_samples_left, self.n_samples_right = (
            n_samples_left, n_samples_right)
        self.value_left, self.value_right = value_left, value_right

        # Categorical split description.
        self.is_categorical, self.left_cat_bitset = (
            is_categorical, left_cat_bitset)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
@cython.final
|
| 117 |
+
cdef class Splitter:
|
| 118 |
+
"""Splitter used to find the best possible split at each node.
|
| 119 |
+
|
| 120 |
+
A split (see SplitInfo) is characterized by a feature and a bin.
|
| 121 |
+
|
| 122 |
+
The Splitter is also responsible for partitioning the samples among the
|
| 123 |
+
leaves of the tree (see split_indices() and the partition attribute).
|
| 124 |
+
|
| 125 |
+
Parameters
|
| 126 |
+
----------
|
| 127 |
+
X_binned : ndarray of int, shape (n_samples, n_features)
|
| 128 |
+
The binned input samples. Must be Fortran-aligned.
|
| 129 |
+
n_bins_non_missing : ndarray, shape (n_features,)
|
| 130 |
+
For each feature, gives the number of bins actually used for
|
| 131 |
+
non-missing values.
|
| 132 |
+
missing_values_bin_idx : uint8
|
| 133 |
+
Index of the bin that is used for missing values. This is the index of
|
| 134 |
+
the last bin and is always equal to max_bins (as passed to the GBDT
|
| 135 |
+
classes), or equivalently to n_bins - 1.
|
| 136 |
+
has_missing_values : ndarray, shape (n_features,)
|
| 137 |
+
Whether missing values were observed in the training data, for each
|
| 138 |
+
feature.
|
| 139 |
+
is_categorical : ndarray of bool of shape (n_features,)
|
| 140 |
+
Indicates categorical features.
|
| 141 |
+
monotonic_cst : ndarray of int of shape (n_features,), dtype=int
|
| 142 |
+
Indicates the monotonic constraint to enforce on each feature.
|
| 143 |
+
- 1: monotonic increase
|
| 144 |
+
- 0: no constraint
|
| 145 |
+
- -1: monotonic decrease
|
| 146 |
+
|
| 147 |
+
Read more in the :ref:`User Guide <monotonic_cst_gbdt>`.
|
| 148 |
+
l2_regularization : float
|
| 149 |
+
The L2 regularization parameter.
|
| 150 |
+
min_hessian_to_split : float, default=1e-3
|
| 151 |
+
The minimum sum of hessians needed in each node. Splits that result in
|
| 152 |
+
at least one child having a sum of hessians less than
|
| 153 |
+
min_hessian_to_split are discarded.
|
| 154 |
+
min_samples_leaf : int, default=20
|
| 155 |
+
The minimum number of samples per leaf.
|
| 156 |
+
min_gain_to_split : float, default=0.0
|
| 157 |
+
The minimum gain needed to split a node. Splits with lower gain will
|
| 158 |
+
be ignored.
|
| 159 |
+
hessians_are_constant: bool, default is False
|
| 160 |
+
Whether hessians are constant.
|
| 161 |
+
feature_fraction_per_split : float, default=1
|
| 162 |
+
Proportion of randomly chosen features in each and every node split.
|
| 163 |
+
This is a form of regularization, smaller values make the trees weaker
|
| 164 |
+
learners and might prevent overfitting.
|
| 165 |
+
rng : Generator
|
| 166 |
+
n_threads : int, default=1
|
| 167 |
+
Number of OpenMP threads to use.
|
| 168 |
+
"""
|
| 169 |
+
cdef public:
|
| 170 |
+
const X_BINNED_DTYPE_C [::1, :] X_binned
|
| 171 |
+
unsigned int n_features
|
| 172 |
+
const unsigned int [::1] n_bins_non_missing
|
| 173 |
+
uint8_t missing_values_bin_idx
|
| 174 |
+
const uint8_t [::1] has_missing_values
|
| 175 |
+
const uint8_t [::1] is_categorical
|
| 176 |
+
const signed char [::1] monotonic_cst
|
| 177 |
+
uint8_t hessians_are_constant
|
| 178 |
+
Y_DTYPE_C l2_regularization
|
| 179 |
+
Y_DTYPE_C min_hessian_to_split
|
| 180 |
+
unsigned int min_samples_leaf
|
| 181 |
+
Y_DTYPE_C min_gain_to_split
|
| 182 |
+
Y_DTYPE_C feature_fraction_per_split
|
| 183 |
+
rng
|
| 184 |
+
|
| 185 |
+
unsigned int [::1] partition
|
| 186 |
+
unsigned int [::1] left_indices_buffer
|
| 187 |
+
unsigned int [::1] right_indices_buffer
|
| 188 |
+
int n_threads
|
| 189 |
+
|
| 190 |
+
def __init__(self,
             const X_BINNED_DTYPE_C [::1, :] X_binned,
             const unsigned int [::1] n_bins_non_missing,
             const uint8_t missing_values_bin_idx,
             const uint8_t [::1] has_missing_values,
             const uint8_t [::1] is_categorical,
             const signed char [::1] monotonic_cst,
             Y_DTYPE_C l2_regularization,
             Y_DTYPE_C min_hessian_to_split=1e-3,
             unsigned int min_samples_leaf=20,
             Y_DTYPE_C min_gain_to_split=0.,
             uint8_t hessians_are_constant=False,
             Y_DTYPE_C feature_fraction_per_split=1.0,
             rng=None,
             unsigned int n_threads=1):
    # Store the configuration and allocate the partition array and the two
    # scratch buffers used by split_indices.
    #
    # `rng` defaults to None instead of a RandomState instance: a default
    # instance would be created once, when the function is defined, and then
    # be shared (state included) by every Splitter built without an explicit
    # `rng`. A fresh RandomState is created per instance instead.

    self.X_binned = X_binned
    # One column of X_binned per feature.
    self.n_features = X_binned.shape[1]
    self.n_bins_non_missing = n_bins_non_missing
    self.missing_values_bin_idx = missing_values_bin_idx
    self.has_missing_values = has_missing_values
    self.is_categorical = is_categorical
    self.monotonic_cst = monotonic_cst
    self.l2_regularization = l2_regularization
    self.min_hessian_to_split = min_hessian_to_split
    self.min_samples_leaf = min_samples_leaf
    self.min_gain_to_split = min_gain_to_split
    self.hessians_are_constant = hessians_are_constant
    self.feature_fraction_per_split = feature_fraction_per_split
    self.rng = np.random.RandomState() if rng is None else rng
    self.n_threads = n_threads

    # The partition array maps each sample index into the leaves of the
    # tree (a leaf in this context is a node that isn't split yet, not
    # necessarily a 'finalized' leaf). Initially, the root contains all
    # the indices, e.g.:
    # partition = [abcdefghijkl]
    # After a call to split_indices, it may look e.g. like this:
    # partition = [cef|abdghijkl]
    # we have 2 leaves, the left one is at position 0 and the second one at
    # position 3. The order of the samples is irrelevant.
    self.partition = np.arange(X_binned.shape[0], dtype=np.uint32)
    # buffers used in split_indices to support parallel splitting.
    self.left_indices_buffer = np.empty_like(self.partition)
    self.right_indices_buffer = np.empty_like(self.partition)
|
| 235 |
+
|
| 236 |
+
def split_indices(Splitter self, split_info, unsigned int [::1]
                  sample_indices):
    """Split samples into left and right arrays.

    The split is performed according to the best possible split
    (split_info).

    Ultimately, this is nothing but a partition of the sample_indices
    array with a given pivot, exactly like a quicksort subroutine.

    Parameters
    ----------
    split_info : SplitInfo
        The SplitInfo of the node to split.
    sample_indices : ndarray of unsigned int, shape (n_samples_at_node,)
        The indices of the samples at the node to split. This is a view
        on self.partition, and it is modified inplace by placing the
        indices of the left child at the beginning, and the indices of
        the right child at the end.

    Returns
    -------
    left_indices : ndarray of int, shape (n_left_samples,)
        The indices of the samples in the left child. This is a view on
        self.partition.
    right_indices : ndarray of int, shape (n_right_samples,)
        The indices of the samples in the right child. This is a view on
        self.partition.
    right_child_position : int
        The position of the right child in ``sample_indices``.
    """
    # This is a multi-threaded implementation inspired by lightgbm. Here
    # is a quick break down. Let's suppose we want to split a node with 24
    # samples named from a to x. self.partition looks like this (the * are
    # indices in other leaves that we don't care about):
    # partition = [*************abcdefghijklmnopqrstuvwx****************]
    #                           ^                       ^
    #                  node_position     node_position + node.n_samples

    # Ultimately, we want to reorder the samples inside the boundaries of
    # the leaf (which becomes a node) to now represent the samples in its
    # left and right child. For example:
    # partition = [*************abefilmnopqrtuxcdghjksvw*****************]
    #                           ^              ^
    #                  left_child_pos     right_child_pos
    # Note that left_child_pos always takes the value of node_position,
    # and right_child_pos = left_child_pos + left_child.n_samples. The
    # order of the samples inside a leaf is irrelevant.

    # 1. sample_indices is a view on this region a..x. We conceptually
    #    divide it into n_threads regions. Each thread will be responsible
    #    for its own region. Here is an example with 4 threads:
    #    sample_indices = [abcdef|ghijkl|mnopqr|stuvwx]
    # 2. Each thread processes 6 = 24 // 4 entries and maps them into
    #    left_indices_buffer or right_indices_buffer. For example, we could
    #    have the following mapping ('.' denotes an undefined entry):
    #    - left_indices_buffer  = [abef..|il....|mnopqr|tux...]
    #    - right_indices_buffer = [cd....|ghjk..|......|svw...]
    # 3. We keep track of the start positions of the regions (the '|') in
    #    ``offset_in_buffers`` as well as the size of each region. We also
    #    keep track of the number of samples put into the left/right child
    #    by each thread. Concretely:
    #    - left_counts =  [4, 2, 6, 3]
    #    - right_counts = [2, 4, 0, 3]
    # 4. Finally, we put left/right_indices_buffer back into the
    #    sample_indices, without any undefined entries and the partition
    #    looks as expected
    #    partition = [*************abefilmnopqrtuxcdghjksvw***************]

    # Note: We here show left/right_indices_buffer as being the same size
    # as sample_indices for simplicity, but in reality they are of the
    # same size as partition.

    cdef:
        int n_samples = sample_indices.shape[0]
        X_BINNED_DTYPE_C bin_idx = split_info.bin_idx
        uint8_t missing_go_to_left = split_info.missing_go_to_left
        uint8_t missing_values_bin_idx = self.missing_values_bin_idx
        int feature_idx = split_info.feature_idx
        # Column of binned values for the feature being split on.
        const X_BINNED_DTYPE_C [::1] X_binned = \
            self.X_binned[:, feature_idx]
        unsigned int [::1] left_indices_buffer = self.left_indices_buffer
        unsigned int [::1] right_indices_buffer = self.right_indices_buffer
        uint8_t is_categorical = split_info.is_categorical
        # Cython is unhappy if we set left_cat_bitset to
        # split_info.left_cat_bitset directly, so we need a tmp var
        BITSET_INNER_DTYPE_C [:] cat_bitset_tmp = split_info.left_cat_bitset
        BITSET_DTYPE_C left_cat_bitset
        int n_threads = self.n_threads

        # Number of entries of sample_indices handled by each thread; the
        # remainder of the division is distributed below.
        int [:] sizes = np.full(n_threads, n_samples // n_threads,
                                dtype=np.int32)
        # Start position of each thread's region (same position in
        # sample_indices and in both buffers).
        int [:] offset_in_buffers = np.zeros(n_threads, dtype=np.int32)
        # Number of samples each thread sent left / right.
        int [:] left_counts = np.empty(n_threads, dtype=np.int32)
        int [:] right_counts = np.empty(n_threads, dtype=np.int32)
        int left_count
        int right_count
        int start
        int stop
        int i
        int thread_idx
        int sample_idx
        int right_child_position
        uint8_t turn_left
        # Where each thread writes its left / right samples back into
        # sample_indices.
        int [:] left_offset = np.zeros(n_threads, dtype=np.int32)
        int [:] right_offset = np.zeros(n_threads, dtype=np.int32)

    # only set left_cat_bitset when is_categorical is True
    if is_categorical:
        left_cat_bitset = &cat_bitset_tmp[0]

    with nogil:
        # The first (n_samples % n_threads) threads take one extra sample so
        # that the sizes sum to n_samples.
        for thread_idx in range(n_samples % n_threads):
            sizes[thread_idx] += 1

        # Prefix sum of the sizes: start of each thread's region.
        for thread_idx in range(1, n_threads):
            offset_in_buffers[thread_idx] = \
                offset_in_buffers[thread_idx - 1] + sizes[thread_idx - 1]

        # map indices from sample_indices to left/right_indices_buffer
        for thread_idx in prange(n_threads, schedule='static',
                                 chunksize=1, num_threads=n_threads):
            left_count = 0
            right_count = 0

            start = offset_in_buffers[thread_idx]
            stop = start + sizes[thread_idx]
            for i in range(start, stop):
                sample_idx = sample_indices[i]
                turn_left = sample_goes_left(
                    missing_go_to_left,
                    missing_values_bin_idx, bin_idx,
                    X_binned[sample_idx], is_categorical,
                    left_cat_bitset)

                # NOTE(review): the counters are updated with `x = x + 1`
                # rather than `x += 1`, presumably because Cython treats
                # in-place operators inside prange as reductions -- confirm
                # before changing.
                if turn_left:
                    left_indices_buffer[start + left_count] = sample_idx
                    left_count = left_count + 1
                else:
                    right_indices_buffer[start + right_count] = sample_idx
                    right_count = right_count + 1

            left_counts[thread_idx] = left_count
            right_counts[thread_idx] = right_count

        # position of right child = just after the left child
        right_child_position = 0
        for thread_idx in range(n_threads):
            right_child_position += left_counts[thread_idx]

        # offset of each thread in sample_indices for left and right
        # child, i.e. where each thread will start to write.
        right_offset[0] = right_child_position
        for thread_idx in range(1, n_threads):
            left_offset[thread_idx] = \
                left_offset[thread_idx - 1] + left_counts[thread_idx - 1]
            right_offset[thread_idx] = \
                right_offset[thread_idx - 1] + right_counts[thread_idx - 1]

        # map indices in left/right_indices_buffer back into
        # sample_indices. This also updates self.partition since
        # sample_indices is a view.
        for thread_idx in prange(n_threads, schedule='static',
                                 chunksize=1, num_threads=n_threads):
            memcpy(
                &sample_indices[left_offset[thread_idx]],
                &left_indices_buffer[offset_in_buffers[thread_idx]],
                sizeof(unsigned int) * left_counts[thread_idx]
            )
            if right_counts[thread_idx] > 0:
                # If we're splitting the rightmost node of the tree, i.e. the
                # rightmost node in the partition array, and if n_threads >= 2, one
                # might have right_counts[-1] = 0 and right_offset[-1] = len(sample_indices)
                # leading to evaluating
                #
                #    &sample_indices[right_offset[-1]] = &sample_indices[n_samples_at_node]
                #                                      = &partition[n_samples_in_tree]
                #
                # which is an out-of-bounds read access that can cause a segmentation fault.
                # When boundscheck=True, removing this check produces this exception:
                #
                #    IndexError: Out of bounds on buffer access
                #
                memcpy(
                    &sample_indices[right_offset[thread_idx]],
                    &right_indices_buffer[offset_in_buffers[thread_idx]],
                    sizeof(unsigned int) * right_counts[thread_idx]
                )

    # Left child slice, right child slice, and the boundary between them.
    return (sample_indices[:right_child_position],
            sample_indices[right_child_position:],
            right_child_position)
|
| 428 |
+
|
| 429 |
+
def find_node_split(
|
| 430 |
+
Splitter self,
|
| 431 |
+
unsigned int n_samples,
|
| 432 |
+
hist_struct [:, ::1] histograms, # IN
|
| 433 |
+
const Y_DTYPE_C sum_gradients,
|
| 434 |
+
const Y_DTYPE_C sum_hessians,
|
| 435 |
+
const Y_DTYPE_C value,
|
| 436 |
+
const Y_DTYPE_C lower_bound=-INFINITY,
|
| 437 |
+
const Y_DTYPE_C upper_bound=INFINITY,
|
| 438 |
+
const unsigned int [:] allowed_features=None,
|
| 439 |
+
):
|
| 440 |
+
"""For each feature, find the best bin to split on at a given node.
|
| 441 |
+
|
| 442 |
+
Return the best split info among all features.
|
| 443 |
+
|
| 444 |
+
Parameters
|
| 445 |
+
----------
|
| 446 |
+
n_samples : int
|
| 447 |
+
The number of samples at the node.
|
| 448 |
+
histograms : ndarray of HISTOGRAM_DTYPE of \
|
| 449 |
+
shape (n_features, max_bins)
|
| 450 |
+
The histograms of the current node.
|
| 451 |
+
sum_gradients : float
|
| 452 |
+
The sum of the gradients for each sample at the node.
|
| 453 |
+
sum_hessians : float
|
| 454 |
+
The sum of the hessians for each sample at the node.
|
| 455 |
+
value : float
|
| 456 |
+
The bounded value of the current node. We directly pass the value
|
| 457 |
+
instead of re-computing it from sum_gradients and sum_hessians,
|
| 458 |
+
because we need to compute the loss and the gain based on the
|
| 459 |
+
*bounded* value: computing the value from
|
| 460 |
+
sum_gradients / sum_hessians would give the unbounded value, and
|
| 461 |
+
the interaction with min_gain_to_split would not be correct
|
| 462 |
+
anymore. Side note: we can't use the lower_bound / upper_bound
|
| 463 |
+
parameters either because these refer to the bounds of the
|
| 464 |
+
children, not the bounds of the current node.
|
| 465 |
+
lower_bound : float
|
| 466 |
+
Lower bound for the children values for respecting the monotonic
|
| 467 |
+
constraints.
|
| 468 |
+
upper_bound : float
|
| 469 |
+
Upper bound for the children values for respecting the monotonic
|
| 470 |
+
constraints.
|
| 471 |
+
allowed_features : None or ndarray, dtype=np.uint32
|
| 472 |
+
Indices of the features that are allowed by interaction constraints to be
|
| 473 |
+
split.
|
| 474 |
+
|
| 475 |
+
Returns
|
| 476 |
+
-------
|
| 477 |
+
best_split_info : SplitInfo
|
| 478 |
+
The info about the best possible split among all features.
|
| 479 |
+
"""
|
| 480 |
+
cdef:
|
| 481 |
+
int feature_idx
|
| 482 |
+
int split_info_idx
|
| 483 |
+
int best_split_info_idx
|
| 484 |
+
int n_allowed_features
|
| 485 |
+
split_info_struct split_info
|
| 486 |
+
split_info_struct * split_infos
|
| 487 |
+
const uint8_t [::1] has_missing_values = self.has_missing_values
|
| 488 |
+
const uint8_t [::1] is_categorical = self.is_categorical
|
| 489 |
+
const signed char [::1] monotonic_cst = self.monotonic_cst
|
| 490 |
+
int n_threads = self.n_threads
|
| 491 |
+
bint has_interaction_cst = False
|
| 492 |
+
Y_DTYPE_C feature_fraction_per_split = self.feature_fraction_per_split
|
| 493 |
+
uint8_t [:] subsample_mask # same as npy_bool
|
| 494 |
+
int n_subsampled_features
|
| 495 |
+
|
| 496 |
+
has_interaction_cst = allowed_features is not None
|
| 497 |
+
if has_interaction_cst:
|
| 498 |
+
n_allowed_features = allowed_features.shape[0]
|
| 499 |
+
else:
|
| 500 |
+
n_allowed_features = self.n_features
|
| 501 |
+
|
| 502 |
+
if feature_fraction_per_split < 1.0:
|
| 503 |
+
# We do all random sampling before the nogil and make sure that we sample
|
| 504 |
+
# exactly n_subsampled_features >= 1 features.
|
| 505 |
+
n_subsampled_features = max(
|
| 506 |
+
1,
|
| 507 |
+
int(ceil(feature_fraction_per_split * n_allowed_features)),
|
| 508 |
+
)
|
| 509 |
+
subsample_mask_arr = np.full(n_allowed_features, False)
|
| 510 |
+
subsample_mask_arr[:n_subsampled_features] = True
|
| 511 |
+
self.rng.shuffle(subsample_mask_arr)
|
| 512 |
+
# https://github.com/numpy/numpy/issues/18273
|
| 513 |
+
subsample_mask = subsample_mask_arr
|
| 514 |
+
|
| 515 |
+
with nogil:
|
| 516 |
+
|
| 517 |
+
split_infos = <split_info_struct *> malloc(
|
| 518 |
+
n_allowed_features * sizeof(split_info_struct))
|
| 519 |
+
|
| 520 |
+
# split_info_idx is index of split_infos of size n_allowed_features.
|
| 521 |
+
# features_idx is the index of the feature column in X.
|
| 522 |
+
for split_info_idx in prange(n_allowed_features, schedule='static',
|
| 523 |
+
num_threads=n_threads):
|
| 524 |
+
if has_interaction_cst:
|
| 525 |
+
feature_idx = allowed_features[split_info_idx]
|
| 526 |
+
else:
|
| 527 |
+
feature_idx = split_info_idx
|
| 528 |
+
|
| 529 |
+
split_infos[split_info_idx].feature_idx = feature_idx
|
| 530 |
+
|
| 531 |
+
# For each feature, find best bin to split on
|
| 532 |
+
# Start with a gain of -1 if no better split is found, that
|
| 533 |
+
# means one of the constraints isn't respected
|
| 534 |
+
# (min_samples_leaf, etc.) and the grower will later turn the
|
| 535 |
+
# node into a leaf.
|
| 536 |
+
split_infos[split_info_idx].gain = -1
|
| 537 |
+
split_infos[split_info_idx].is_categorical = is_categorical[feature_idx]
|
| 538 |
+
|
| 539 |
+
# Note that subsample_mask is indexed by split_info_idx and not by
|
| 540 |
+
# feature_idx because we only need to exclude the same features again
|
| 541 |
+
# and again. We do NOT need to access the features directly by using
|
| 542 |
+
# allowed_features.
|
| 543 |
+
if feature_fraction_per_split < 1.0 and not subsample_mask[split_info_idx]:
|
| 544 |
+
continue
|
| 545 |
+
|
| 546 |
+
if is_categorical[feature_idx]:
|
| 547 |
+
self._find_best_bin_to_split_category(
|
| 548 |
+
feature_idx, has_missing_values[feature_idx],
|
| 549 |
+
histograms, n_samples, sum_gradients, sum_hessians,
|
| 550 |
+
value, monotonic_cst[feature_idx], lower_bound,
|
| 551 |
+
upper_bound, &split_infos[split_info_idx])
|
| 552 |
+
else:
|
| 553 |
+
# We will scan bins from left to right (in all cases), and
|
| 554 |
+
# if there are any missing values, we will also scan bins
|
| 555 |
+
# from right to left. This way, we can consider whichever
|
| 556 |
+
# case yields the best gain: either missing values go to
|
| 557 |
+
# the right (left to right scan) or to the left (right to
|
| 558 |
+
# left case). See algo 3 from the XGBoost paper
|
| 559 |
+
# https://arxiv.org/abs/1603.02754
|
| 560 |
+
# Note: for the categorical features above, this isn't
|
| 561 |
+
# needed since missing values are considered a native
|
| 562 |
+
# category.
|
| 563 |
+
self._find_best_bin_to_split_left_to_right(
|
| 564 |
+
feature_idx, has_missing_values[feature_idx],
|
| 565 |
+
histograms, n_samples, sum_gradients, sum_hessians,
|
| 566 |
+
value, monotonic_cst[feature_idx],
|
| 567 |
+
lower_bound, upper_bound, &split_infos[split_info_idx])
|
| 568 |
+
|
| 569 |
+
if has_missing_values[feature_idx]:
|
| 570 |
+
# We need to explore both directions to check whether
|
| 571 |
+
# sending the nans to the left child would lead to a higher
|
| 572 |
+
# gain
|
| 573 |
+
self._find_best_bin_to_split_right_to_left(
|
| 574 |
+
feature_idx, histograms, n_samples,
|
| 575 |
+
sum_gradients, sum_hessians,
|
| 576 |
+
value, monotonic_cst[feature_idx],
|
| 577 |
+
lower_bound, upper_bound, &split_infos[split_info_idx])
|
| 578 |
+
|
| 579 |
+
# then compute best possible split among all features
|
| 580 |
+
# split_info is set to the best of split_infos
|
| 581 |
+
best_split_info_idx = self._find_best_feature_to_split_helper(
|
| 582 |
+
split_infos, n_allowed_features
|
| 583 |
+
)
|
| 584 |
+
split_info = split_infos[best_split_info_idx]
|
| 585 |
+
|
| 586 |
+
out = SplitInfo(
|
| 587 |
+
split_info.gain,
|
| 588 |
+
split_info.feature_idx,
|
| 589 |
+
split_info.bin_idx,
|
| 590 |
+
split_info.missing_go_to_left,
|
| 591 |
+
split_info.sum_gradient_left,
|
| 592 |
+
split_info.sum_hessian_left,
|
| 593 |
+
split_info.sum_gradient_right,
|
| 594 |
+
split_info.sum_hessian_right,
|
| 595 |
+
split_info.n_samples_left,
|
| 596 |
+
split_info.n_samples_right,
|
| 597 |
+
split_info.value_left,
|
| 598 |
+
split_info.value_right,
|
| 599 |
+
split_info.is_categorical,
|
| 600 |
+
None, # left_cat_bitset will only be set if the split is categorical
|
| 601 |
+
)
|
| 602 |
+
# Only set bitset if the split is categorical
|
| 603 |
+
if split_info.is_categorical:
|
| 604 |
+
out.left_cat_bitset = np.asarray(split_info.left_cat_bitset, dtype=np.uint32)
|
| 605 |
+
|
| 606 |
+
free(split_infos)
|
| 607 |
+
return out
|
| 608 |
+
|
| 609 |
+
cdef int _find_best_feature_to_split_helper(
        self,
        split_info_struct * split_infos,  # IN
        int n_allowed_features,
) noexcept nogil:
    """Return the position in split_infos of the entry with the highest gain.

    Parameters
    ----------
    split_infos : split_info_struct *
        Array of per-feature split candidates; only the ``gain`` field of
        each entry is read here.
    n_allowed_features : int
        Number of entries of ``split_infos`` to inspect.

    Returns
    -------
    int
        Index of the winning entry. An entry only replaces the current
        winner when its gain is strictly greater, so ties keep the lowest
        index. Index 0 is returned when fewer than two entries are given.
    """
    cdef:
        int candidate_idx
        int winner_idx = 0

    for candidate_idx in range(1, n_allowed_features):
        if split_infos[winner_idx].gain < split_infos[candidate_idx].gain:
            winner_idx = candidate_idx

    return winner_idx
|
| 623 |
+
|
| 624 |
+
cdef void _find_best_bin_to_split_left_to_right(
        Splitter self,
        unsigned int feature_idx,
        uint8_t has_missing_values,
        const hist_struct [:, ::1] histograms,  # IN
        unsigned int n_samples,
        Y_DTYPE_C sum_gradients,
        Y_DTYPE_C sum_hessians,
        Y_DTYPE_C value,
        signed char monotonic_cst,
        Y_DTYPE_C lower_bound,
        Y_DTYPE_C upper_bound,
        split_info_struct * split_info) noexcept nogil:  # OUT
    """Scan the bins of one feature from left to right for the best split.

    Bins ``0..bin_idx`` are accumulated into the left child and everything
    else (including the missing-values bin, if any) is attributed to the
    right child. Candidates violating min_samples_leaf,
    min_hessian_to_split or min_gain_to_split are discarded.

    This scan is run whether or not the feature has missing values.
    ``split_info`` is only written when a candidate with a gain greater than
    both -1 and ``self.min_gain_to_split`` is found; otherwise it is left
    exactly as the caller provided it.
    """
    cdef:
        unsigned int bin_idx
        unsigned int count_left = 0
        unsigned int count_right
        # The last non-missing bin must never be sent to the left child
        # (that would leave the right child empty), unless there are
        # missing values, since those go to the right child in this scan.
        unsigned int end = \
            self.n_bins_non_missing[feature_idx] - 1 + has_missing_values
        Y_DTYPE_C hess_left = 0.
        Y_DTYPE_C hess_right
        Y_DTYPE_C grad_left = 0.
        Y_DTYPE_C grad_right
        Y_DTYPE_C node_loss = _loss_from_value(value, sum_gradients)
        Y_DTYPE_C candidate_gain
        uint8_t improved = False

        Y_DTYPE_C top_gain = -1
        Y_DTYPE_C top_hess_left
        Y_DTYPE_C top_grad_left
        unsigned int top_bin_idx
        unsigned int top_count_left
        hist_struct bin_hist

    for bin_idx in range(end):
        bin_hist = histograms[feature_idx, bin_idx]

        # Running totals for the left child; the right child is whatever
        # remains of the node totals.
        count_left += bin_hist.count
        count_right = n_samples - count_left

        if self.hessians_are_constant:
            # the sample count is used in place of the hessian sum
            hess_left += bin_hist.count
        else:
            hess_left += bin_hist.sum_hessians
        hess_right = sum_hessians - hess_left

        grad_left += bin_hist.sum_gradients
        grad_right = sum_gradients - grad_left

        if count_left < self.min_samples_leaf:
            # left child still too small: keep accumulating
            continue
        if count_right < self.min_samples_leaf:
            # the right child only shrinks from here: won't get any better
            break

        if hess_left < self.min_hessian_to_split:
            continue
        if hess_right < self.min_hessian_to_split:
            # won't get any better (hessians are > 0 since loss is convex)
            break

        candidate_gain = _split_gain(
            grad_left, hess_left,
            grad_right, hess_right,
            node_loss,
            monotonic_cst,
            lower_bound,
            upper_bound,
            self.l2_regularization,
        )

        if candidate_gain > top_gain and candidate_gain > self.min_gain_to_split:
            improved = True
            top_gain = candidate_gain
            top_bin_idx = bin_idx
            top_grad_left = grad_left
            top_hess_left = hess_left
            top_count_left = count_left

    if not improved:
        return

    split_info.gain = top_gain
    split_info.bin_idx = top_bin_idx
    # we scan from left to right so missing values go to the right
    split_info.missing_go_to_left = False
    split_info.sum_gradient_left = top_grad_left
    split_info.sum_gradient_right = sum_gradients - top_grad_left
    split_info.sum_hessian_left = top_hess_left
    split_info.sum_hessian_right = sum_hessians - top_hess_left
    split_info.n_samples_left = top_count_left
    split_info.n_samples_right = n_samples - top_count_left

    # The children values are recomputed for the winner only; it's cheap.
    split_info.value_left = compute_node_value(
        split_info.sum_gradient_left, split_info.sum_hessian_left,
        lower_bound, upper_bound, self.l2_regularization)

    split_info.value_right = compute_node_value(
        split_info.sum_gradient_right, split_info.sum_hessian_right,
        lower_bound, upper_bound, self.l2_regularization)
|
| 740 |
+
|
| 741 |
+
cdef void _find_best_bin_to_split_right_to_left(
        self,
        unsigned int feature_idx,
        const hist_struct [:, ::1] histograms,  # IN
        unsigned int n_samples,
        Y_DTYPE_C sum_gradients,
        Y_DTYPE_C sum_hessians,
        Y_DTYPE_C value,
        signed char monotonic_cst,
        Y_DTYPE_C lower_bound,
        Y_DTYPE_C upper_bound,
        split_info_struct * split_info) noexcept nogil:  # OUT
    """Scan the bins of one feature from right to left for the best split.

    Non-missing bins above ``bin_idx`` are accumulated into the right child
    and everything else (including the missing-values bin) is attributed to
    the left child. Candidates violating min_samples_leaf,
    min_hessian_to_split or min_gain_to_split are discarded.

    The scan starts from the gain already stored in ``split_info`` (the
    result of the previous left-to-right scan), so ``split_info`` is only
    overwritten when sending missing values to the left yields a strictly
    higher gain.

    When the data has no missing values this method isn't called, since
    _find_best_bin_to_split_left_to_right alone is enough.
    """
    cdef:
        unsigned int bin_idx
        unsigned int start = self.n_bins_non_missing[feature_idx] - 2
        unsigned int count_left
        unsigned int count_right = 0
        Y_DTYPE_C hess_left
        Y_DTYPE_C hess_right = 0.
        Y_DTYPE_C grad_left
        Y_DTYPE_C grad_right = 0.
        Y_DTYPE_C node_loss = _loss_from_value(value, sum_gradients)
        Y_DTYPE_C candidate_gain
        uint8_t improved = False

        Y_DTYPE_C top_gain = split_info.gain  # computed during previous scan
        Y_DTYPE_C top_hess_left
        Y_DTYPE_C top_grad_left
        unsigned int top_bin_idx
        unsigned int top_count_left
        hist_struct bin_hist

    for bin_idx in range(start, -1, -1):
        # The bin just above the candidate threshold joins the right child.
        bin_hist = histograms[feature_idx, bin_idx + 1]

        count_right += bin_hist.count
        count_left = n_samples - count_right

        if self.hessians_are_constant:
            # the sample count is used in place of the hessian sum
            hess_right += bin_hist.count
        else:
            hess_right += bin_hist.sum_hessians
        hess_left = sum_hessians - hess_right

        grad_right += bin_hist.sum_gradients
        grad_left = sum_gradients - grad_right

        if count_right < self.min_samples_leaf:
            # right child still too small: keep accumulating
            continue
        if count_left < self.min_samples_leaf:
            # the left child only shrinks from here: won't get any better
            break

        if hess_right < self.min_hessian_to_split:
            continue
        if hess_left < self.min_hessian_to_split:
            # won't get any better (hessians are > 0 since loss is convex)
            break

        candidate_gain = _split_gain(
            grad_left, hess_left,
            grad_right, hess_right,
            node_loss,
            monotonic_cst,
            lower_bound,
            upper_bound,
            self.l2_regularization,
        )

        if candidate_gain > top_gain and candidate_gain > self.min_gain_to_split:
            improved = True
            top_gain = candidate_gain
            top_bin_idx = bin_idx
            top_grad_left = grad_left
            top_hess_left = hess_left
            top_count_left = count_left

    if not improved:
        return

    split_info.gain = top_gain
    split_info.bin_idx = top_bin_idx
    # we scan from right to left so missing values go to the left
    split_info.missing_go_to_left = True
    split_info.sum_gradient_left = top_grad_left
    split_info.sum_gradient_right = sum_gradients - top_grad_left
    split_info.sum_hessian_left = top_hess_left
    split_info.sum_hessian_right = sum_hessians - top_hess_left
    split_info.n_samples_left = top_count_left
    split_info.n_samples_right = n_samples - top_count_left

    # The children values are recomputed for the winner only; it's cheap.
    split_info.value_left = compute_node_value(
        split_info.sum_gradient_left, split_info.sum_hessian_left,
        lower_bound, upper_bound, self.l2_regularization)

    split_info.value_right = compute_node_value(
        split_info.sum_gradient_right, split_info.sum_hessian_right,
        lower_bound, upper_bound, self.l2_regularization)
|
| 856 |
+
|
| 857 |
+
cdef void _find_best_bin_to_split_category(
        self,
        unsigned int feature_idx,
        uint8_t has_missing_values,
        const hist_struct [:, ::1] histograms,  # IN
        unsigned int n_samples,
        Y_DTYPE_C sum_gradients,
        Y_DTYPE_C sum_hessians,
        Y_DTYPE_C value,
        char monotonic_cst,
        Y_DTYPE_C lower_bound,
        Y_DTYPE_C upper_bound,
        split_info_struct * split_info) noexcept nogil:  # OUT
    """Find best split for categorical features.

    Categories with enough support are first sorted by their
    ``sum_gradients / (sum_hessians + MIN_CAT_SUPPORT)`` ratio, and then a
    scan is performed as if categories were ordered quantities. The
    categories sent to the left child are recorded in
    ``split_info.left_cat_bitset``.

    ``split_info`` is only written when a candidate with a gain greater than
    both -1 and ``self.min_gain_to_split`` is found; otherwise (including
    when the scratch buffer cannot be allocated) it is left exactly as the
    caller provided it.

    Ref: "On Grouping for Maximum Homogeneity", Walter D. Fisher
    """
    # NOTE(review): monotonic_cst is declared as plain `char` here while the
    # numerical scans and _split_gain use `signed char`; the signedness of
    # plain `char` is platform dependent. Kept as-is to preserve the
    # signature -- confirm whether it should be aligned.

    cdef:
        unsigned int bin_idx
        unsigned int n_bins_non_missing = self.n_bins_non_missing[feature_idx]
        unsigned int missing_values_bin_idx = self.missing_values_bin_idx
        categorical_info * cat_infos
        unsigned int sorted_cat_idx
        unsigned int n_used_bins = 0
        int [2] scan_direction
        int direction = 0
        int best_direction = 0
        unsigned int middle
        unsigned int i
        const hist_struct[::1] feature_hist = histograms[feature_idx, :]
        hist_struct hist
        Y_DTYPE_C sum_gradients_bin
        Y_DTYPE_C sum_hessians_bin
        Y_DTYPE_C loss_current_node
        Y_DTYPE_C sum_gradient_left, sum_hessian_left
        Y_DTYPE_C sum_gradient_right, sum_hessian_right
        unsigned int n_samples_left, n_samples_right
        Y_DTYPE_C gain
        Y_DTYPE_C best_gain = -1.0
        uint8_t found_better_split = False
        Y_DTYPE_C best_sum_hessian_left
        Y_DTYPE_C best_sum_gradient_left
        unsigned int best_n_samples_left
        unsigned int best_cat_infos_thresh
        # Reduces the effect of noise in categorical features,
        # especially for categories with few data. Called cat_smooth in
        # LightGBM. TODO: Make this user adjustable?
        Y_DTYPE_C MIN_CAT_SUPPORT = 10.
        # this is equal to 1 for losses where hessians are constant
        Y_DTYPE_C support_factor = n_samples / sum_hessians

    # Details on the split finding:
    # We first order categories by their
    # sum_gradients / (sum_hessians + MIN_CAT_SUPPORT) values, and we exclude
    # categories that don't respect MIN_CAT_SUPPORT from this sorted array.
    # Missing values are treated just like any other category. The
    # low-support categories will always be mapped to the right child. We
    # scan the sorted categories array from left to right and from right to
    # left, and we stop at the middle.

    # Considering ordered categories A B C D, with E being a low-support
    # category: A B C D
    #              ^
    #           midpoint
    # The scans will consider the following split-points:
    # * left to right:
    #   A - B C D E
    #   A B - C D E
    # * right to left:
    #   D - A B C E
    #   C D - A B E

    # Note that since we stop at the middle and since low-support
    # categories (E) are always mapped to the right, the following splits
    # aren't considered:
    # A E - B C D
    # D E - A B C
    # Basically, we're forcing E to always be mapped to the child that has
    # *at least half of the categories* (and this child is always the right
    # child, by convention).

    # Also note that if we scanned in only one direction (e.g. left to
    # right), we would only consider the following splits:
    # A - B C D E
    # A B - C D E
    # A B C - D E
    # and thus we would be missing on D - A B C E and on C D - A B E

    cat_infos = <categorical_info *> malloc(
        (n_bins_non_missing + has_missing_values) * sizeof(categorical_info))
    if cat_infos == NULL:
        # The scratch buffer could not be allocated (or a zero-sized request
        # returned NULL). We cannot raise from a noexcept nogil function, so
        # report "no split found" by leaving split_info untouched instead of
        # dereferencing a NULL pointer below.
        return

    # fill cat_infos while filtering out categories based on MIN_CAT_SUPPORT
    for bin_idx in range(n_bins_non_missing):
        hist = feature_hist[bin_idx]
        if self.hessians_are_constant:
            sum_hessians_bin = hist.count
        else:
            sum_hessians_bin = hist.sum_hessians
        if sum_hessians_bin * support_factor >= MIN_CAT_SUPPORT:
            cat_infos[n_used_bins].bin_idx = bin_idx
            sum_gradients_bin = hist.sum_gradients

            cat_infos[n_used_bins].value = (
                sum_gradients_bin / (sum_hessians_bin + MIN_CAT_SUPPORT)
            )
            n_used_bins += 1

    # Also add missing values bin so that nans are considered as a category
    if has_missing_values:
        hist = feature_hist[missing_values_bin_idx]
        if self.hessians_are_constant:
            sum_hessians_bin = hist.count
        else:
            sum_hessians_bin = hist.sum_hessians
        if sum_hessians_bin * support_factor >= MIN_CAT_SUPPORT:
            cat_infos[n_used_bins].bin_idx = missing_values_bin_idx
            sum_gradients_bin = (
                hist.sum_gradients
            )

            cat_infos[n_used_bins].value = (
                sum_gradients_bin / (sum_hessians_bin + MIN_CAT_SUPPORT)
            )
            n_used_bins += 1

    # not enough categories to form a split
    if n_used_bins <= 1:
        free(cat_infos)
        return

    qsort(cat_infos, n_used_bins, sizeof(categorical_info),
          compare_cat_infos)

    loss_current_node = _loss_from_value(value, sum_gradients)

    scan_direction[0], scan_direction[1] = 1, -1
    for direction in scan_direction:
        # Each direction stops at the middle of the sorted categories; the
        # backward scan covers one category fewer so that the two scans
        # never evaluate the same partition.
        if direction == 1:
            middle = (n_used_bins + 1) // 2
        else:
            middle = (n_used_bins + 1) // 2 - 1

        # The categories we'll consider will go to the left child
        sum_gradient_left, sum_hessian_left = 0., 0.
        n_samples_left = 0

        for i in range(middle):
            sorted_cat_idx = i if direction == 1 else n_used_bins - 1 - i
            bin_idx = cat_infos[sorted_cat_idx].bin_idx
            hist = feature_hist[bin_idx]

            n_samples_left += hist.count
            n_samples_right = n_samples - n_samples_left

            if self.hessians_are_constant:
                sum_hessian_left += hist.count
            else:
                sum_hessian_left += hist.sum_hessians
            sum_hessian_right = sum_hessians - sum_hessian_left

            sum_gradient_left += hist.sum_gradients
            sum_gradient_right = sum_gradients - sum_gradient_left

            if (
                n_samples_left < self.min_samples_leaf or
                sum_hessian_left < self.min_hessian_to_split
            ):
                # left child still too small: keep accumulating
                continue
            if (
                n_samples_right < self.min_samples_leaf or
                sum_hessian_right < self.min_hessian_to_split
            ):
                # the right child only shrinks from here
                break

            gain = _split_gain(sum_gradient_left, sum_hessian_left,
                               sum_gradient_right, sum_hessian_right,
                               loss_current_node, monotonic_cst,
                               lower_bound, upper_bound,
                               self.l2_regularization)
            if gain > best_gain and gain > self.min_gain_to_split:
                found_better_split = True
                best_gain = gain
                best_cat_infos_thresh = sorted_cat_idx
                best_sum_gradient_left = sum_gradient_left
                best_sum_hessian_left = sum_hessian_left
                best_n_samples_left = n_samples_left
                best_direction = direction

    if found_better_split:
        split_info.gain = best_gain

        # split_info.bin_idx is unused for categorical splits: left_cat_bitset
        # is used instead and set below
        split_info.bin_idx = 0

        split_info.sum_gradient_left = best_sum_gradient_left
        split_info.sum_gradient_right = sum_gradients - best_sum_gradient_left
        split_info.sum_hessian_left = best_sum_hessian_left
        split_info.sum_hessian_right = sum_hessians - best_sum_hessian_left
        split_info.n_samples_left = best_n_samples_left
        split_info.n_samples_right = n_samples - best_n_samples_left

        # We recompute best values here but it's cheap
        split_info.value_left = compute_node_value(
            split_info.sum_gradient_left, split_info.sum_hessian_left,
            lower_bound, upper_bound, self.l2_regularization)

        split_info.value_right = compute_node_value(
            split_info.sum_gradient_right, split_info.sum_hessian_right,
            lower_bound, upper_bound, self.l2_regularization)

        # create bitset with values from best_cat_infos_thresh
        init_bitset(split_info.left_cat_bitset)
        if best_direction == 1:
            # forward scan: the left child holds the head of the sorted array
            for sorted_cat_idx in range(best_cat_infos_thresh + 1):
                bin_idx = cat_infos[sorted_cat_idx].bin_idx
                set_bitset(split_info.left_cat_bitset, bin_idx)
        else:
            # backward scan: the left child holds the tail of the sorted array
            for sorted_cat_idx in range(n_used_bins - 1, best_cat_infos_thresh - 1, -1):
                bin_idx = cat_infos[sorted_cat_idx].bin_idx
                set_bitset(split_info.left_cat_bitset, bin_idx)

        if has_missing_values:
            split_info.missing_go_to_left = in_bitset(
                split_info.left_cat_bitset, missing_values_bin_idx)

    free(cat_infos)
|
| 1087 |
+
|
| 1088 |
+
|
| 1089 |
+
cdef int compare_cat_infos(const void * a, const void * b) noexcept nogil:
    """qsort comparator ordering categorical_info entries by ascending value.

    Returns a negative number, zero, or a positive number when the ``value``
    field of ``a`` is respectively less than, equal to, or greater than that
    of ``b``.

    Returning 0 for equal values matters: qsort requires the comparator to
    give consistent answers, and reporting "greater" for both (a, b) and
    (b, a) when the values tie breaks that requirement.
    """
    if (<categorical_info *>a).value < (<categorical_info *>b).value:
        return -1
    if (<categorical_info *>a).value > (<categorical_info *>b).value:
        return 1
    return 0
|
| 1091 |
+
|
| 1092 |
+
cdef inline Y_DTYPE_C _split_gain(
        Y_DTYPE_C sum_gradient_left,
        Y_DTYPE_C sum_hessian_left,
        Y_DTYPE_C sum_gradient_right,
        Y_DTYPE_C sum_hessian_right,
        Y_DTYPE_C loss_current_node,
        signed char monotonic_cst,
        Y_DTYPE_C lower_bound,
        Y_DTYPE_C upper_bound,
        Y_DTYPE_C l2_regularization) noexcept nogil:
    """Loss reduction

    Return how much the loss decreases when the node is split into the given
    left and right children, compared to keeping the node a leaf of the
    tree. -1 is returned when the children values would violate the
    monotonic constraint.

    See Equation 7 of:
    :arxiv:`T. Chen, C. Guestrin, (2016) XGBoost: A Scalable Tree Boosting System,
    <1603.02754>.`
    """
    cdef:
        Y_DTYPE_C child_value_left
        Y_DTYPE_C child_value_right
        Y_DTYPE_C child_loss_left
        Y_DTYPE_C child_loss_right

    # Values of the potential children, already clipped to the bounds.
    child_value_left = compute_node_value(
        sum_gradient_left, sum_hessian_left,
        lower_bound, upper_bound, l2_regularization)
    child_value_right = compute_node_value(
        sum_gradient_right, sum_hessian_right,
        lower_bound, upper_bound, l2_regularization)

    # Reject splits that do not respect the monotonic constraints. These
    # comparisons need to be done on values that have already been clipped
    # to take the monotonic constraints into account (if any).
    if monotonic_cst == MonotonicConstraint.POS:
        if child_value_left > child_value_right:
            return -1
    elif monotonic_cst == MonotonicConstraint.NEG:
        if child_value_left < child_value_right:
            return -1

    # Note that for the gain to be correct (and for min_gain_to_split to work
    # as expected), we need all values to be bounded (current node, left child
    # and right child).
    child_loss_left = _loss_from_value(child_value_left, sum_gradient_left)
    child_loss_right = _loss_from_value(child_value_right, sum_gradient_right)

    return loss_current_node - child_loss_left - child_loss_right
|
| 1140 |
+
|
| 1141 |
+
cdef inline Y_DTYPE_C _loss_from_value(
        Y_DTYPE_C value,
        Y_DTYPE_C sum_gradient) noexcept nogil:
    """Return loss of a node from its (bounded) value

    The loss is the product of the node's gradient sum and its value.

    See Equation 6 of:
    :arxiv:`T. Chen, C. Guestrin, (2016) XGBoost: A Scalable Tree Boosting System,
    <1603.02754>.`
    """
    cdef Y_DTYPE_C node_loss = sum_gradient * value
    return node_loss
|
| 1151 |
+
|
| 1152 |
+
cdef inline uint8_t sample_goes_left(
        uint8_t missing_go_to_left,
        uint8_t missing_values_bin_idx,
        X_BINNED_DTYPE_C split_bin_idx,
        X_BINNED_DTYPE_C bin_value,
        uint8_t is_categorical,
        BITSET_DTYPE_C left_cat_bitset) noexcept nogil:
    """Helper to decide whether sample should go to left or right child.

    Returns a truthy value when the sample whose binned feature value is
    ``bin_value`` belongs to the left child, and a falsy value otherwise.
    """
    if is_categorical:
        # note: if any, missing values are encoded in left_cat_bitset
        return in_bitset(left_cat_bitset, bin_value)

    # Numerical split: a sample in the missing-values bin follows the
    # direction recorded for the split.
    if missing_go_to_left and bin_value == missing_values_bin_idx:
        return True

    # Otherwise the sample goes left when its bin is at or below the
    # split bin.
    return bin_value <= split_bin_idx
|
| 1173 |
+
|
| 1174 |
+
|
| 1175 |
+
cpdef inline Y_DTYPE_C compute_node_value(
        Y_DTYPE_C sum_gradient,
        Y_DTYPE_C sum_hessian,
        Y_DTYPE_C lower_bound,
        Y_DTYPE_C upper_bound,
        Y_DTYPE_C l2_regularization) noexcept nogil:
    """Compute a node's value.

    The raw value is ``-sum_gradient / (sum_hessian + l2_regularization)``,
    with a tiny constant added to the denominator, and is then capped in the
    [lower_bound, upper_bound] interval to respect monotonic constraints.
    Shrinkage is ignored.

    See Equation 5 of:
    :arxiv:`T. Chen, C. Guestrin, (2016) XGBoost: A Scalable Tree Boosting System,
    <1603.02754>.`
    """
    cdef:
        Y_DTYPE_C raw_value

    # 1e-15 keeps the denominator from being exactly zero.
    raw_value = -sum_gradient / (sum_hessian + l2_regularization + 1e-15)

    if raw_value < lower_bound:
        return lower_bound
    if raw_value > upper_bound:
        return upper_bound
    return raw_value
|